From 4d178f94ebe123d462a51169b53854cb7f198888 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 12 Apr 2015 09:14:45 -0400 Subject: x86/asm: Merge common 32-bit values in asm-offsets.c Merge common values for 32-bit native and compat. Signed-off-by: Brian Gerst Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Denys Vlasenko Link: http://lkml.kernel.org/r/1428844486-6638-1-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets.c | 19 +++++++++++++++++++ arch/x86/kernel/asm-offsets_32.c | 15 --------------- arch/x86/kernel/asm-offsets_64.c | 21 --------------------- 3 files changed, 19 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 9f6b9341950f..b27f6ec90caa 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -41,6 +41,25 @@ void common(void) { OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + BLANK(); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext_ia32, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext_ia32, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext_ia32, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext_ia32, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext_ia32, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext_ia32, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext_ia32, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip); + + BLANK(); + OFFSET(TI_sysenter_return, thread_info, sysenter_return); + + BLANK(); + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); +#endif + #ifdef CONFIG_PARAVIRT BLANK(); OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 47703aed74cf..628bfd4c06bb 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -17,17 +17,6 @@ void foo(void); void foo(void) { - OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax); - OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx); - OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx); - OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx); - OFFSET(IA32_SIGCONTEXT_si, sigcontext, si); - OFFSET(IA32_SIGCONTEXT_di, sigcontext, di); - OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp); - OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp); - OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip); - BLANK(); - OFFSET(CPUINFO_x86, cpuinfo_x86, x86); OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); @@ -37,7 +26,6 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_sysenter_return, thread_info, sysenter_return); OFFSET(TI_cpu, thread_info, cpu); BLANK(); @@ -60,9 +48,6 @@ void foo(void) OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); - BLANK(); - OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 5ce6f2da8763..dcaab87da629 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -29,27 +29,6 @@ int main(void) BLANK(); #endif -#ifdef CONFIG_IA32_EMULATION - OFFSET(TI_sysenter_return, thread_info, sysenter_return); - BLANK(); - -#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry) - ENTRY(ax); - ENTRY(bx); - ENTRY(cx); - ENTRY(dx); - ENTRY(si); - ENTRY(di); - ENTRY(bp); - ENTRY(sp); - ENTRY(ip); - BLANK(); -#undef ENTRY - - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); - BLANK(); -#endif - #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(cx); -- cgit v1.2.1 From 14434052ffb3b7fe8f491e9d0a7793376fb79155 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 12 Apr 2015 09:14:46 -0400 Subject: x86/asm: Remove unused TI_cpu Signed-off-by: Brian Gerst Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Denys Vlasenko Link: http://lkml.kernel.org/r/1428844486-6638-2-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets_32.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 628bfd4c06bb..6ce39025f467 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -26,9 +26,6 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_cpu, thread_info, cpu); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); -- cgit v1.2.1 From ff22e2010144b6aa050da35851f1fa79087cca06 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 12 Apr 2015 21:45:06 +0200 Subject: x86/asm, x86/power/hibernate: Use local labels in asm ... so that they don't appear in the object file and thus in objdump output. They're local anyway and have a meaning only within that file. No functionality change. Signed-off-by: Borislav Petkov Acked-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: H. Peter Anvin Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: linux-pm@vger.kernel.org Link: http://lkml.kernel.org/r/1428867906-12016-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/power/hibernate_asm_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 3c4469a7a929..e2386cb4e0c3 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -78,9 +78,9 @@ ENTRY(restore_image) /* code below has been relocated to a safe page */ ENTRY(core_restore_code) -loop: +.Lloop: testq %rdx, %rdx - jz done + jz .Ldone /* get addresses from the pbe and copy the page */ movq pbe_address(%rdx), %rsi @@ -91,8 +91,8 @@ loop: /* progress to the next pbe */ movq pbe_next(%rdx), %rdx - jmp loop -done: + jmp .Lloop +.Ldone: /* jump to the restore_registers address from the image header */ jmpq *%rax /* -- cgit v1.2.1 From c0f6feba784e1087b905ad097d2d9ac0aaf744a5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 15 Apr 2015 08:50:14 +0200 Subject: x86/asm, x86/acpi/wakeup_64.S: Make global label a local one Make it a local symbol so that it doesn't appear in objdump output. No functionality change - code remains the same, just the global label disappears: ffffffff81039dbe: bf 03 00 00 00 mov $0x3,%edi ffffffff81039dc3: 31 c0 xor %eax,%eax ffffffff81039dc5: e8 b6 fd ff ff callq ffffffff81039b80 -ffffffff81039dca: eb 00 jmp ffffffff81039dcc - -ffffffff81039dcc : +ffffffff81039dca: eb 00 jmp ffffffff81039dcc ffffffff81039dcc: 48 c7 c0 80 1a ca 82 mov $0xffffffff82ca1a80,%rax ffffffff81039dd3: 48 8b 98 e2 00 00 00 mov 0xe2(%rax),%rbx ffffffff81039dda: 0f 22 e3 mov %rbx,%cr4 Signed-off-by: Borislav Petkov Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1429080614-22610-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/wakeup_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index ae693b51ed8e..8c35df468104 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -62,7 +62,7 @@ ENTRY(do_suspend_lowlevel) pushfq popq pt_regs_flags(%rax) - movq $resume_point, saved_rip(%rip) + movq $.Lresume_point, saved_rip(%rip) movq %rsp, saved_rsp movq %rbp, saved_rbp @@ -75,10 +75,10 @@ ENTRY(do_suspend_lowlevel) xorl %eax, %eax call x86_acpi_enter_sleep_state /* in case something went wrong, restore the machine status and go on */ - jmp resume_point + jmp .Lresume_point .align 4 -resume_point: +.Lresume_point: /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax movq saved_context_cr4(%rax), %rbx -- cgit v1.2.1 From 6a907738ab9840ca3d71c22cd28fba4cbae7f7ce Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 15 Apr 2015 10:51:26 +0100 Subject: x86/asm: Enable fast 32-bit put_user_64() for copy_to_user() For fixed sized copies, copy_to_user() will utilize __put_user_size() fastpaths. However, it is missing the translation for 64-bit copies on x86/32. Testing on a Pinetrail Atom, the 64 bit put_user() fastpath is substantially faster than the generic copy_to_user() fallback. Signed-off-by: Chris Wilson Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Link: http://lkml.kernel.org/r/1429091486-11443-1-git-send-email-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess_32.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 3c03a5de64d3..0ed5504c6060 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -59,6 +59,10 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret, 4); return ret; + case 8: + __put_user_size(*(u64 *)from, (u64 __user *)to, + 8, ret, 8); + return ret; } } return __copy_to_user_ll(to, from, n); -- cgit v1.2.1 From aac82d319148c6a84e1bf90b86d3e0ec8bf0ee38 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Apr 2015 15:51:54 -0700 Subject: x86, paravirt, xen: Remove the 64-bit ->irq_enable_sysexit() pvop We don't use irq_enable_sysexit on 64-bit kernels any more. Remove all the paravirt and Xen machinery to support it on 64-bit kernels. Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8a03355698fe5b94194e9e7360f19f91c1b2cf1f.1428100853.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 6 ------ arch/x86/include/asm/paravirt_types.h | 7 ++++--- arch/x86/kernel/asm-offsets.c | 2 ++ arch/x86/kernel/paravirt.c | 4 +++- arch/x86/kernel/paravirt_patch_64.c | 1 - arch/x86/xen/enlighten.c | 3 ++- arch/x86/xen/xen-asm_64.S | 16 ---------------- arch/x86/xen/xen-ops.h | 2 ++ 8 files changed, 13 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a821b1cd4fa7..3cdb9eafbf8c 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -77,12 +77,6 @@ ENTRY(native_usergs_sysret32) swapgs sysretl ENDPROC(native_usergs_sysret32) - -ENTRY(native_irq_enable_sysexit) - swapgs - sti - sysexit -ENDPROC(native_irq_enable_sysexit) #endif /* diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7549b8b369e4..38a0ff9ef06e 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -160,13 +160,14 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); unsigned long long (*read_tscp)(unsigned int *aux); +#ifdef CONFIG_X86_32 /* * Atomically enable interrupts and return to userspace. This - * is only ever used to return to 32-bit processes; in a - * 64-bit kernel, it's used for 32-on-64 compat processes, but - * never native 64-bit processes. (Jump, not call.) + * is only used in 32-bit kernels. 64-bit kernels use + * usergs_sysret32 instead. */ void (*irq_enable_sysexit)(void); +#endif /* * Switch to usermode gs and return to 64-bit usermode using diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index b27f6ec90caa..8e3d22a1af94 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -68,7 +68,9 @@ void common(void) { OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); +#ifdef CONFIG_X86_32 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); +#endif OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 548d25f00c90..7563114d9c3a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -154,7 +154,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || +#ifdef CONFIG_X86_32 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || +#endif type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ @@ -371,7 +373,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +#if defined(CONFIG_X86_32) .irq_enable_sysexit = native_irq_enable_sysexit, #endif #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index a1da6737ba5b..0de21c62c348 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -49,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); PATCH_SITE(pv_cpu_ops, swapgs); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 81665c9f2132..3797b6b31f95 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1267,10 +1267,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_tscp = native_read_tscp, .iret = xen_iret, - .irq_enable_sysexit = xen_sysexit, #ifdef CONFIG_X86_64 .usergs_sysret32 = xen_sysret32, .usergs_sysret64 = xen_sysret64, +#else + .irq_enable_sysexit = xen_sysexit, #endif .load_tr_desc = paravirt_nop, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 985fc3ee0973..a2cabb8bd6bf 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -47,22 +47,6 @@ ENTRY(xen_iret) ENDPATCH(xen_iret) RELOC(xen_iret, 1b+1) -/* - * sysexit is not used for 64-bit processes, so it's only ever used to - * return to 32-bit compat userspace. - */ -ENTRY(xen_sysexit) - pushq $__USER32_DS - pushq %rcx - pushq $X86_EFLAGS_IF - pushq $__USER32_CS - pushq %rdx - - pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_sysexit) -RELOC(xen_sysexit, 1b+1) - ENTRY(xen_sysret64) /* * We're already on the usermode stack at this point, but diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9e195c683549..c20fe29e65f4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -134,7 +134,9 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); +#ifdef CONFIG_X86_32 __visible void xen_sysexit(void); +#endif __visible void xen_sysret32(void); __visible void xen_sysret64(void); __visible void xen_adjust_exception_frame(void); -- cgit v1.2.1 From 3462bd2adeadc49d9e126bca3b5536a3437a902d Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Mon, 20 Apr 2015 23:27:11 +0200 Subject: x86/asm: Always inline atomics During some code analysis I realized that atomic_add(), atomic_sub() and friends are not necessarily inlined AND that each function is defined multiple times: atomic_inc: 544 duplicates atomic_dec: 215 duplicates atomic_dec_and_test: 107 duplicates atomic64_inc: 38 duplicates [...] Each definition is exact equally, e.g.: ffffffff813171b8 : 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 01 3e lock add %edi,(%rsi) 5d pop %rbp c3 retq In turn each definition has one or more callsites (sure): ffffffff81317c78: e8 3b f5 ff ff callq ffffffff813171b8 [...] ffffffff8131a062: e8 51 d1 ff ff callq ffffffff813171b8 [...] ffffffff8131a190: e8 23 d0 ff ff callq ffffffff813171b8 [...] The other way around would be to remove the static linkage - but I prefer an enforced inlining here. Before: text data bss dec hex filename 81467393 19874720 20168704 121510817 73e1ba1 vmlinux.orig After: text data bss dec hex filename 81461323 19874720 20168704 121504747 73e03eb vmlinux.inlined Yes, the inlining here makes the kernel even smaller! ;) Linus further observed: "I have this memory of having seen that before - the size heuristics for gcc getting confused by inlining. [...] It might be a good idea to mark things that are basically just wrappers around a single (or a couple of) asm instruction to be always_inline." Signed-off-by: Hagen Paul Pfeifer Acked-by: Linus Torvalds Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1429565231-4609-1-git-send-email-hagen@jauu.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic.h | 16 ++++++++-------- arch/x86/include/asm/atomic64_64.h | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 5e5cd123fdfb..75a9ee8529f3 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -46,7 +46,7 @@ static inline void atomic_set(atomic_t *v, int i) * * Atomically adds @i to @v. */ -static inline void atomic_add(int i, atomic_t *v) +static __always_inline void atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) @@ -60,7 +60,7 @@ static inline void atomic_add(int i, atomic_t *v) * * Atomically subtracts @i from @v. */ -static inline void atomic_sub(int i, atomic_t *v) +static __always_inline void atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) @@ -76,7 +76,7 @@ static inline void atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ -static inline int atomic_sub_and_test(int i, atomic_t *v) +static __always_inline int atomic_sub_and_test(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e"); } @@ -87,7 +87,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v) * * Atomically increments @v by 1. */ -static inline void atomic_inc(atomic_t *v) +static __always_inline void atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter)); @@ -99,7 +99,7 @@ static inline void atomic_inc(atomic_t *v) * * Atomically decrements @v by 1. */ -static inline void atomic_dec(atomic_t *v) +static __always_inline void atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter)); @@ -113,7 +113,7 @@ static inline void atomic_dec(atomic_t *v) * returns true if the result is 0, or false for all other * cases. */ -static inline int atomic_dec_and_test(atomic_t *v) +static __always_inline int atomic_dec_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); } @@ -152,7 +152,7 @@ static inline int atomic_add_negative(int i, atomic_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static inline int atomic_add_return(int i, atomic_t *v) +static __always_inline int atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } @@ -191,7 +191,7 @@ static inline int atomic_xchg(atomic_t *v, int new) * Atomically adds @a to @v, so long as @v was not already @u. * Returns the old value of @v. */ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index f8d273e18516..b965f9e03f2a 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -40,7 +40,7 @@ static inline void atomic64_set(atomic64_t *v, long i) * * Atomically adds @i to @v. */ -static inline void atomic64_add(long i, atomic64_t *v) +static __always_inline void atomic64_add(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) @@ -81,7 +81,7 @@ static inline int atomic64_sub_and_test(long i, atomic64_t *v) * * Atomically increments @v by 1. */ -static inline void atomic64_inc(atomic64_t *v) +static __always_inline void atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) @@ -94,7 +94,7 @@ static inline void atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. */ -static inline void atomic64_dec(atomic64_t *v) +static __always_inline void atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) @@ -148,7 +148,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static inline long atomic64_add_return(long i, atomic64_t *v) +static __always_inline long atomic64_add_return(long i, atomic64_t *v) { return i + xadd(&v->counter, i); } -- cgit v1.2.1 From 17be0aec74fb036eb4eb32c2268f3420a034762b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 21 Apr 2015 18:27:29 +0200 Subject: x86/asm/entry/64: Implement better check for canonical addresses This change makes the check exact (no more false positives on "negative" addresses). Andy explains: "Canonical addresses either start with 17 zeros or 17 ones. In the old code, we checked that the top (64-47) = 17 bits were all zero. We did this by shifting right by 47 bits and making sure that nothing was left. In the new code, we're shifting left by (64 - 48) = 16 bits and then signed shifting right by the same amount, this propagating the 17th highest bit to all positions to its left. If we get the same value we started with, then we're good to go." While it isn't really important to be fully correct here - almost all addresses we'll ever see will be userspace ones, but OTOH it looks to be cheap enough: the new code uses two more ALU ops but preserves %rcx, allowing to not reload it from pt_regs->cx again. On disassembly level, the changes are: cmp %rcx,0x80(%rsp) -> mov 0x80(%rsp),%r11; cmp %rcx,%r11 shr $0x2f,%rcx -> shl $0x10,%rcx; sar $0x10,%rcx; cmp %rcx,%r11 mov 0x58(%rsp),%rcx -> (eliminated) Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429633649-20169-1-git-send-email-dvlasenk@redhat.com [ Changelog massage. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c7b238494b31..3c78a15a537d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -410,26 +410,27 @@ syscall_return: * a completely clean 64-bit userspace context. */ movq RCX(%rsp),%rcx - cmpq %rcx,RIP(%rsp) /* RCX == RIP */ + movq RIP(%rsp),%r11 + cmpq %rcx,%r11 /* RCX == RIP */ jne opportunistic_sysret_failed /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. It's not worth - * testing for canonicalness exactly -- this check detects any - * of the 17 high bits set, which is true for non-canonical - * or kernel addresses. (This will pessimize vsyscall=native. - * Big deal.) + * the kernel, since userspace controls RSP. * - * If virtual addresses ever become wider, this will need + * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. */ .ifne __VIRTUAL_MASK_SHIFT - 47 .error "virtual address width changed -- SYSRET checks need update" .endif - shr $__VIRTUAL_MASK_SHIFT, %rcx - jnz opportunistic_sysret_failed + /* Change top 16 bits to be the sign-extension of 47th bit */ + shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 + jne opportunistic_sysret_failed cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ jne opportunistic_sysret_failed @@ -466,8 +467,8 @@ syscall_return: */ syscall_return_via_sysret: CFI_REMEMBER_STATE - /* r11 is already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_R11 + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp),%rsp USERGS_SYSRET64 CFI_RESTORE_STATE -- cgit v1.2.1 From ac7f5dfb0348a33b2ea92a0c477103c4db45ad4e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 21 Apr 2015 18:03:13 +0200 Subject: x86/asm/entry/64: Merge 32-bit execve stubs with x32 ones, as they are identical Run-tested. Suggested-by: Brian Gerst Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429632194-13445-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3c78a15a537d..e952f6bf1d6d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -525,40 +525,27 @@ GLOBAL(stub_execveat) CFI_ENDPROC END(stub_execveat) -#ifdef CONFIG_X86_X32_ABI +#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) .align 8 GLOBAL(stub_x32_execve) +GLOBAL(stub32_execve) CFI_STARTPROC DEFAULT_FRAME 0, 8 call compat_sys_execve jmp return_from_execve CFI_ENDPROC +END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 - call compat_sys_execveat - jmp return_from_execve - CFI_ENDPROC -END(stub_x32_execveat) -#endif - -#ifdef CONFIG_IA32_EMULATION - .align 8 -GLOBAL(stub32_execve) - CFI_STARTPROC - call compat_sys_execve - jmp return_from_execve - CFI_ENDPROC -END(stub32_execve) - .align 8 GLOBAL(stub32_execveat) CFI_STARTPROC + DEFAULT_FRAME 0, 8 call compat_sys_execveat jmp return_from_execve CFI_ENDPROC END(stub32_execveat) +END(stub_x32_execveat) #endif /* -- cgit v1.2.1 From 3f5159a9221f19b08275b0a6388ab14392ae4eec Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 21 Apr 2015 18:03:14 +0200 Subject: x86/asm/entry/32: Update -ENOSYS handling to match the 64-bit logic Recently Andy changed the 64-bit syscall logic so that pt_regs->ax is initially set to -ENOSYS, and on syscall exit, it is updated with the actual return value. This simplified the logic there. This patch does the same for 32-bit syscall entry points. The check for %rax being too big is moved to be just before the call instruction which dispatches execution through the syscall table. There is no way to accidentally skip this check now by jumping to a label after it. This allows us to remove redundant checks after ptrace et al. If %rax is too big, we just skip over the (call, write %rax to pt_regs->ax) instruction pair. pt_regs->ax remains set to -ENOSYS, and it gets returned to userspace. Similar to 64-bit code, this eliminates the "ia32_badsys" code path. Run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1429632194-13445-2-git-send-email-dvlasenk@redhat.com [ Changelog massage. ] Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 3cdb9eafbf8c..56fd6dd2e342 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -136,7 +136,7 @@ ENTRY(ia32_sysenter_target) pushq_cfi_reg rsi /* pt_regs->si */ pushq_cfi_reg rdx /* pt_regs->dx */ pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi_reg rax /* pt_regs->ax */ + pushq_cfi $-ENOSYS /* pt_regs->ax */ cld sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ CFI_ADJUST_CFA_OFFSET 10*8 @@ -163,8 +163,6 @@ sysenter_flags_fixed: testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz sysenter_tracesys - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys sysenter_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ @@ -173,8 +171,11 @@ sysenter_do_call: movl %ebx,%edi /* arg1 */ movl %edx,%edx /* arg3 (zero extension) */ sysenter_dispatch: + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f call *ia32_sys_call_table(,%rax,8) movq %rax,RAX(%rsp) +1: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -241,9 +242,7 @@ sysexit_from_sys_call: movl %ebx,%esi /* 2nd arg: 1st syscall arg */ movl %eax,%edi /* 1st arg: syscall number */ call __audit_syscall_entry - movl RAX(%rsp),%eax /* reload syscall number */ - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys + movl ORIG_RAX(%rsp),%eax /* reload syscall number */ movl %ebx,%edi /* reload 1st syscall arg */ movl RCX(%rsp),%esi /* reload 2nd syscall arg */ movl RDX(%rsp),%edx /* reload 3rd syscall arg */ @@ -294,13 +293,10 @@ sysenter_tracesys: #endif SAVE_EXTRA_REGS CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call CFI_ENDPROC ENDPROC(ia32_sysenter_target) @@ -370,7 +366,7 @@ ENTRY(ia32_cstar_target) pushq_cfi_reg rdx /* pt_regs->dx */ pushq_cfi_reg rbp /* pt_regs->cx */ movl %ebp,%ecx - pushq_cfi_reg rax /* pt_regs->ax */ + pushq_cfi $-ENOSYS /* pt_regs->ax */ sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ CFI_ADJUST_CFA_OFFSET 10*8 @@ -386,8 +382,6 @@ ENTRY(ia32_cstar_target) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz cstar_tracesys - cmpq $IA32_NR_syscalls-1,%rax - ja ia32_badsys cstar_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ @@ -396,8 +390,11 @@ cstar_do_call: movl %ebx,%edi /* arg1 */ movl %edx,%edx /* arg3 (zero extension) */ cstar_dispatch: + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f call *ia32_sys_call_table(,%rax,8) movq %rax,RAX(%rsp) +1: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -444,14 +441,11 @@ cstar_tracesys: xchgl %r9d,%ebp SAVE_EXTRA_REGS CLEAR_RREGS r9 - movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS xchgl %ebp,%r9d - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ jmp cstar_do_call END(ia32_cstar_target) @@ -510,7 +504,7 @@ ENTRY(ia32_syscall) pushq_cfi_reg rsi /* pt_regs->si */ pushq_cfi_reg rdx /* pt_regs->dx */ pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi_reg rax /* pt_regs->ax */ + pushq_cfi $-ENOSYS /* pt_regs->ax */ cld sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ CFI_ADJUST_CFA_OFFSET 10*8 @@ -518,8 +512,6 @@ ENTRY(ia32_syscall) orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz ia32_tracesys - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys ia32_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ @@ -527,9 +519,12 @@ ia32_do_call: xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ movl %ebx,%edi /* arg1 */ movl %edx,%edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: movq %rax,RAX(%rsp) +1: ia32_ret_from_sys_call: CLEAR_RREGS jmp int_ret_from_sys_call @@ -537,23 +532,14 @@ ia32_ret_from_sys_call: ia32_tracesys: SAVE_EXTRA_REGS CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ jmp ia32_do_call + CFI_ENDPROC END(ia32_syscall) -ia32_badsys: - movq $0,ORIG_RAX(%rsp) - movq $-ENOSYS,%rax - jmp ia32_sysret - - CFI_ENDPROC - .macro PTREGSCALL label, func ALIGN GLOBAL(\label) -- cgit v1.2.1 From 514c2304b4574dae28c3d7c0ad6b9cf296994140 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:00 +0000 Subject: x86: perf: Use hrtimer_start() hrtimer_start() does not longer defer already expired timers to the softirq. Get rid of the __hrtimer_start_range_ns() invocation. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20150414203502.260487331@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 999289b94025..10190c044a7e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -204,9 +204,8 @@ again: static void rapl_start_hrtimer(struct rapl_pmu *pmu) { - __hrtimer_start_range_ns(&pmu->hrtimer, - pmu->timer_interval, 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&pmu->hrtimer, pmu->timer_interval, + HRTIMER_MODE_REL_PINNED); } static void rapl_stop_hrtimer(struct rapl_pmu *pmu) -- cgit v1.2.1 From 576b0704c9def6d54b3ae9e13b0b7567c713f568 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:01 +0000 Subject: x86: perf: uncore: Use hrtimer_start() hrtimer_start() does not longer defer already expired timers to the softirq. Get rid of the __hrtimer_start_range_ns() invocation. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20150414203502.360555157@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c635b8b49e93..7c411f0e58fd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -233,9 +233,8 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) { - __hrtimer_start_range_ns(&box->hrtimer, - ns_to_ktime(box->hrtimer_duration), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), + HRTIMER_MODE_REL_PINNED); } void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) -- cgit v1.2.1 From 5f0052f9522b84269e1b3b435a806f873d992702 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:23 +0800 Subject: x86/irq: Save destination CPU ID in irq_cfg Cache destination CPU APIC ID into struct irq_cfg when assigning vector for interrupt. Upper layer just needs to read the cached APIC ID instead of calling apic->cpu_mask_to_apicid_and(), it helps to hide APIC driver details from IOAPIC/HPET/MSI drivers.. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-2-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 1 + arch/x86/kernel/apic/vector.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e9571ddabc4f..cda96954cbbf 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -116,6 +116,7 @@ struct irq_data; struct irq_cfg { cpumask_var_t domain; cpumask_var_t old_domain; + unsigned int dest_apicid; u8 vector; u8 move_in_progress : 1; #ifdef CONFIG_IRQ_REMAP diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6cedd7914581..c724ef6b218c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -188,6 +188,12 @@ next: } free_cpumask_var(tmp_mask); + if (!err) { + /* cache destination APIC IDs into cfg->dest_apicid */ + err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, + &cfg->dest_apicid); + } + return err; } -- cgit v1.2.1 From b5dc8e6c21e7ffba0246bf39cea97805c142bf85 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:24 +0800 Subject: x86/irq: Use hierarchical irqdomain to manage CPU interrupt vectors Abstract CPU local APIC as an interrupt controller and create an irqdomain for it to manage CPU interrupt vectors. It's the base to enable hierarchical irqdomains on x86 systems. The final irqdomain hierarchy will look like this: IOAPIC domain ----| MSI/MSI-x domain ----> [Interrupt Remapping domain] -> CPU vector domain HPET_IRQ domain ----| ^ | DMAR domain ----------------------------------------------| HT_IRQ domain ----------------------------------------------| Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Prarit Bhargava Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-3-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 3 +- arch/x86/include/asm/hw_irq.h | 17 +++++ arch/x86/kernel/apic/io_apic.c | 3 - arch/x86/kernel/apic/vector.c | 155 +++++++++++++++++++++++++++++++++++++---- 4 files changed, 160 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6049d587599e..e75a96c38c58 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -465,7 +465,6 @@ config X86_INTEL_CE select X86_REBOOTFIXUPS select OF select OF_EARLY_FLATTREE - select IRQ_DOMAIN ---help--- Select for the Intel CE media processor (CE4100) SOC. This option compiles in support for the CE4100 SOC for settop @@ -914,11 +913,11 @@ config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + select IRQ_DOMAIN_HIERARCHY config X86_IO_APIC def_bool y depends on X86_LOCAL_APIC || X86_UP_IOAPIC - select IRQ_DOMAIN config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index cda96954cbbf..5b951ac56aa1 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -112,6 +112,17 @@ struct irq_2_irte { #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; +struct irq_domain; + +struct irq_alloc_info { + u32 flags; + const struct cpumask *mask; /* CPU mask for vector allocation */ +}; + +enum { + /* Allocate contiguous CPU vectors */ + X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, +}; struct irq_cfg { cpumask_var_t domain; @@ -135,6 +146,12 @@ struct irq_cfg { }; }; +extern struct irq_domain *x86_vector_domain; + +extern void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask); +extern void copy_irq_alloc_info(struct irq_alloc_info *dst, + struct irq_alloc_info *src); extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f4dc2462a1ac..56d532106ef3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2356,9 +2356,6 @@ static int mp_irqdomain_create(int ioapic) ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1); - if (gsi_cfg->gsi_base == 0) - irq_set_default_host(ip->irqdomain); - return 0; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index c724ef6b218c..6358d8d351f5 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu + * Enable support of hierarchical irqdomains * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,7 +21,9 @@ #include #include +struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); +static struct irq_chip lapic_controller; void lock_vector_lock(void) { @@ -36,15 +40,21 @@ void unlock_vector_lock(void) struct irq_cfg *irq_cfg(unsigned int irq) { - return irq_get_chip_data(irq); + return irqd_cfg(irq_get_irq_data(irq)); } struct irq_cfg *irqd_cfg(struct irq_data *irq_data) { + if (!irq_data) + return NULL; + + while (irq_data->parent_data) + irq_data = irq_data->parent_data; + return irq_data->chip_data; } -static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) +static struct irq_cfg *alloc_irq_cfg(int node) { struct irq_cfg *cfg; @@ -79,7 +89,7 @@ struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) return cfg; } - cfg = alloc_irq_cfg(at, node); + cfg = alloc_irq_cfg(node); if (cfg) irq_set_chip_data(at, cfg); else @@ -87,14 +97,13 @@ struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) return cfg; } -static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) +static void free_irq_cfg(struct irq_cfg *cfg) { - if (!cfg) - return; - irq_set_chip_data(at, NULL); - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); + if (cfg) { + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); + } } static int @@ -241,6 +250,90 @@ void clear_irq_vector(int irq, struct irq_cfg *cfg) raw_spin_unlock_irqrestore(&vector_lock, flags); } +void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask) +{ + memset(info, 0, sizeof(*info)); + info->mask = mask; +} + +void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) +{ + if (src) + *dst = *src; + else + memset(dst, 0, sizeof(*dst)); +} + +static inline const struct cpumask * +irq_alloc_info_get_mask(struct irq_alloc_info *info) +{ + return (!info || !info->mask) ? apic->target_cpus() : info->mask; +} + +static void x86_vector_free_irqs(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); + if (irq_data && irq_data->chip_data) { + free_remapped_irq(virq); + clear_irq_vector(virq + i, irq_data->chip_data); + free_irq_cfg(irq_data->chip_data); + irq_domain_reset_irq_data(irq_data); + } + } +} + +static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + const struct cpumask *mask; + struct irq_data *irq_data; + struct irq_cfg *cfg; + int i, err; + + if (disable_apic) + return -ENXIO; + + /* Currently vector allocator can't guarantee contiguous allocations */ + if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) + return -ENOSYS; + + mask = irq_alloc_info_get_mask(info); + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + BUG_ON(!irq_data); + cfg = alloc_irq_cfg(irq_data->node); + if (!cfg) { + err = -ENOMEM; + goto error; + } + + irq_data->chip = &lapic_controller; + irq_data->chip_data = cfg; + irq_data->hwirq = virq + i; + err = assign_irq_vector(virq, cfg, mask); + if (err) + goto error; + } + + return 0; + +error: + x86_vector_free_irqs(domain, virq, i + 1); + return err; +} + +static struct irq_domain_ops x86_vector_domain_ops = { + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, +}; + int __init arch_probe_nr_irqs(void) { int nr; @@ -266,6 +359,11 @@ int __init arch_probe_nr_irqs(void) int __init arch_early_irq_init(void) { + x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops, + NULL); + BUG_ON(x86_vector_domain == NULL); + irq_set_default_host(x86_vector_domain); + return arch_early_ioapic_init(); } @@ -380,6 +478,36 @@ int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, return 0; } +static int vector_set_affinity(struct irq_data *irq_data, + const struct cpumask *dest, bool force) +{ + struct irq_cfg *cfg = irq_data->chip_data; + int err, irq = irq_data->irq; + + if (!config_enabled(CONFIG_SMP)) + return -EPERM; + + if (!cpumask_intersects(dest, cpu_online_mask)) + return -EINVAL; + + err = assign_irq_vector(irq, cfg, dest); + if (err) { + struct irq_data *top = irq_get_irq_data(irq); + + if (assign_irq_vector(irq, cfg, top->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } + + return IRQ_SET_MASK_OK; +} + +static struct irq_chip lapic_controller = { + .irq_ack = apic_ack_edge, + .irq_set_affinity = vector_set_affinity, + .irq_retrigger = apic_retrigger_irq, +}; + #ifdef CONFIG_SMP void send_cleanup_vector(struct irq_cfg *cfg) { @@ -497,7 +625,7 @@ int arch_setup_hwirq(unsigned int irq, int node) unsigned long flags; int ret; - cfg = alloc_irq_cfg(irq, node); + cfg = alloc_irq_cfg(node); if (!cfg) return -ENOMEM; @@ -508,7 +636,7 @@ int arch_setup_hwirq(unsigned int irq, int node) if (!ret) irq_set_chip_data(irq, cfg); else - free_irq_cfg(irq, cfg); + free_irq_cfg(cfg); return ret; } @@ -518,7 +646,8 @@ void arch_teardown_hwirq(unsigned int irq) free_remapped_irq(irq); clear_irq_vector(irq, cfg); - free_irq_cfg(irq, cfg); + irq_set_chip_data(irq, NULL); + free_irq_cfg(cfg); } static void __init print_APIC_field(int base) -- cgit v1.2.1 From bd8eb63f8a3907bb477992145cb6ce0064a1e43f Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:25 +0800 Subject: x86/hpet: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for HPET, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Srivatsa S. Bhat Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1416894816-23245-4-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3acbff4716b0..ae29554f57ea 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -476,7 +477,7 @@ static int hpet_msi_next_event(unsigned long delta, static int hpet_setup_msi_irq(unsigned int irq) { if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); return -EINVAL; } return 0; @@ -484,9 +485,10 @@ static int hpet_setup_msi_irq(unsigned int irq) static int hpet_assign_irq(struct hpet_dev *dev) { - unsigned int irq = irq_alloc_hwirq(-1); + int irq; - if (!irq) + irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); + if (irq <= 0) return -EINVAL; irq_set_handler_data(irq, dev); -- cgit v1.2.1 From 4c8f9960ee497020d0858362c81ece984bc89aa5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:26 +0800 Subject: x86/MSI: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for PCI MSI, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1416894816-23245-5-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index d6ba2d660dc5..76cc2c902176 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -146,23 +147,20 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { struct msi_desc *msidesc; - unsigned int irq; - int node, ret; + int irq, ret; /* Multiple MSI vectors only supported with interrupt remapping */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; - node = dev_to_node(&dev->dev); - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = irq_alloc_hwirq(node); - if (!irq) + irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); + if (irq <= 0) return -ENOSPC; ret = setup_msi_irq(dev, msidesc, irq, 0); if (ret < 0) { - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); return ret; } @@ -172,7 +170,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) void native_teardown_msi_irq(unsigned int irq) { - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); } #ifdef CONFIG_DMAR_TABLE -- cgit v1.2.1 From 331dd19eee243e1b7e670c5993609121817afeaa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:27 +0800 Subject: x86/uv: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-6-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/platform/uv/uv_irq.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 0ce673645432..474912d03f40 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -130,24 +131,14 @@ static int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { - const struct cpumask *eligible_cpu = cpumask_of(cpu); struct irq_cfg *cfg = irq_cfg(irq); unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; - int mmr_pnode, err; - unsigned int dest; + int mmr_pnode; BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest); - if (err != 0) - return err; - if (limit == UV_AFFINITY_CPU) irq_set_status_flags(irq, IRQ_NO_BALANCING); else @@ -164,7 +155,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = dest; + entry->dest = cfg->dest_apicid; mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -238,9 +229,13 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { - int ret, irq = irq_alloc_hwirq(uv_blade_to_memory_nid(mmr_blade)); + int ret, irq; + struct irq_alloc_info info; - if (!irq) + init_irq_alloc_info(&info, cpumask_of(cpu)); + irq = irq_domain_alloc_irqs(NULL, 1, uv_blade_to_memory_nid(mmr_blade), + &info); + if (irq <= 0) return -EBUSY; ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, @@ -248,7 +243,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, if (ret == irq) uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); else - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); return ret; } @@ -283,6 +278,6 @@ void uv_teardown_irq(unsigned int irq) n = n->rb_right; } spin_unlock_irqrestore(&uv_irq_lock, irqflags); - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); } EXPORT_SYMBOL_GPL(uv_teardown_irq); -- cgit v1.2.1 From af87baedf2c23b1181f51323339210a26a64f7fc Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:28 +0800 Subject: x86/htirq: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for HTIRQ, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. This patch changes the interfaces between arch independent PCI driver and arch specific code. Currently HT_IRQ is only enabled on x86, so it does not affect other architectures. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-7-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/htirq.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 816f36e979ad..b307ee7a7148 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -61,31 +62,30 @@ static struct irq_chip ht_irq_chip = { .flags = IRQCHIP_SKIP_SET_WAKE, }; +int arch_alloc_ht_irq(struct pci_dev *dev) +{ + return irq_domain_alloc_irqs(NULL, 1, dev_to_node(&dev->dev), NULL); +} + +void arch_free_ht_irq(int irq) +{ + irq_domain_free_irqs(irq, 1); +} + int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { struct irq_cfg *cfg; struct ht_irq_msg msg; - unsigned dest; - int err; if (disable_apic) return -ENXIO; cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); msg.address_lo = HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | HT_IRQ_LOW_VECTOR(cfg->vector) | ((apic->irq_dest_mode == 0) ? HT_IRQ_LOW_DM_PHYSICAL : -- cgit v1.2.1 From a62b32cdd0a6324c959f40b3c9b928b275297066 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:29 +0800 Subject: x86/dmar: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for DMAR and interrupt remapping, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. The private definitions of irq_alloc_hwirqs()/irq_free_hwirqs() are a temporary solution, they will be removed once we have converted the interrupt remapping driver to use irqdomain framework. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Joerg Roedel Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-8-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 4 ++-- arch/x86/kernel/apic/msi.c | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 6224d316c405..86d897bc15dd 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -99,7 +99,7 @@ static inline bool setup_remapped_irq(int irq, } #endif /* CONFIG_IRQ_REMAP */ -#define dmar_alloc_hwirq() irq_alloc_hwirq(-1) -#define dmar_free_hwirq irq_free_hwirq +extern int dmar_alloc_hwirq(void); +extern void dmar_free_hwirq(int irq); #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 76cc2c902176..9be7d6d8a579 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -223,6 +223,16 @@ int arch_setup_dmar_msi(unsigned int irq) "edge"); return 0; } + +int dmar_alloc_hwirq(void) +{ + return irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); +} + +void dmar_free_hwirq(int irq) +{ + irq_domain_free_irqs(irq, 1); +} #endif /* -- cgit v1.2.1 From 947045a2aac1157c85a24984c9a8128846ae7266 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:30 +0800 Subject: irq_remapping: Introduce new interfaces to support hierarchical irqdomains Introduce new interfaces for interrupt remapping drivers to support hierarchical irqdomains: 1) irq_remapping_get_ir_irq_domain(): get irqdomain associated with an interrupt remapping unit. IOAPIC/HPET drivers use this interface to get parent interrupt remapping irqdomain. 2) irq_remapping_get_irq_domain(): get irqdomain for an IRQ allocation. This is mainly used to support MSI irqdomain. We must build one MSI irqdomain for each interrupt remapping unit. MSI driver calls this interface to get MSI irqdomain associated with an IR irqdomain which manages the PCI devices. In a further step we will store the irqdomain pointer in the device struct to avoid this call in the irq allocation path. Architecture specific hooks: 1) arch_get_ir_parent_domain(): get parent irqdomain for IR irqdomain, which is x86_vector_domain on x86 platforms. 2) arch_create_msi_irq_domain(): create an MSI irqdomain associated with the interrupt remapping unit. We also add following callbacks into struct irq_remap_ops: struct irq_domain *(*get_ir_irq_domain)(struct irq_alloc_info *); struct irq_domain *(*get_irq_domain)(struct irq_alloc_info *); Once all clients of IR have been converted to the new hierarchical irqdomain interfaces, we will: 1) Remove set_ioapic_entry, set_affinity, free_irq, compose_msi_msg, msi_alloc_irq, msi_setup_irq, setup_hpet_msi from struct remap_osp 2) Remove setup_ioapic_remapped_entry, free_remapped_irq, compose_remapped_msi_msg, setup_hpet_msi_remapped, setup_remapped_irq. 3) Simplify x86_io_apic_ops and x86_msi. We can achieve a way clearer architecture with all these changes applied. Signed-off-by: Jiang Liu Acked-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-9-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 37 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/irq_remapping.h | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 5b951ac56aa1..75a97a5bbfa8 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -113,10 +113,47 @@ struct irq_2_irte { #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct irq_domain; +struct pci_dev; +struct msi_desc; + +enum irq_alloc_type { + X86_IRQ_ALLOC_TYPE_IOAPIC = 1, + X86_IRQ_ALLOC_TYPE_HPET, + X86_IRQ_ALLOC_TYPE_MSI, + X86_IRQ_ALLOC_TYPE_MSIX, +}; struct irq_alloc_info { + enum irq_alloc_type type; u32 flags; const struct cpumask *mask; /* CPU mask for vector allocation */ + union { + int unused; +#ifdef CONFIG_HPET_TIMER + struct { + int hpet_id; + int hpet_index; + void *hpet_data; + }; +#endif +#ifdef CONFIG_PCI_MSI + struct { + struct pci_dev *msi_dev; + irq_hw_number_t msi_hwirq; + }; +#endif +#ifdef CONFIG_X86_IO_APIC + struct { + int ioapic_id; + int ioapic_pin; + int ioapic_node; + u32 ioapic_trigger : 1; + u32 ioapic_polarity : 1; + u32 ioapic_valid : 1; + struct IO_APIC_route_entry *ioapic_entry; + }; +#endif + }; }; enum { diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 86d897bc15dd..0cd6195cc375 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -22,6 +22,8 @@ #ifndef __X86_IRQ_REMAPPING_H #define __X86_IRQ_REMAPPING_H +#include +#include #include struct IO_APIC_route_entry; @@ -30,6 +32,7 @@ struct irq_chip; struct msi_msg; struct pci_dev; struct irq_cfg; +struct irq_alloc_info; #ifdef CONFIG_IRQ_REMAP @@ -56,6 +59,25 @@ extern bool setup_remapped_irq(int irq, void irq_remap_modify_chip_defaults(struct irq_chip *chip); +extern struct irq_domain * +irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info); +extern struct irq_domain * +irq_remapping_get_irq_domain(struct irq_alloc_info *info); +extern void irq_remapping_print_chip(struct irq_data *data, struct seq_file *p); + +/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ +static inline struct irq_domain * +arch_create_msi_irq_domain(struct irq_domain *parent) +{ + return NULL; +} + +/* Get parent irqdomain for interrupt remapping irqdomain */ +static inline struct irq_domain *arch_get_ir_parent_domain(void) +{ + return x86_vector_domain; +} + #else /* CONFIG_IRQ_REMAP */ static inline void set_irq_remapping_broken(void) { } @@ -97,6 +119,20 @@ static inline bool setup_remapped_irq(int irq, { return false; } + +static inline struct irq_domain * +irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) +{ + return NULL; +} + +static inline struct irq_domain * +irq_remapping_get_irq_domain(struct irq_alloc_info *info) +{ + return NULL; +} + +#define irq_remapping_print_chip NULL #endif /* CONFIG_IRQ_REMAP */ extern int dmar_alloc_hwirq(void); -- cgit v1.2.1 From 3cb96f0c97330834929abe9bd2ca3c252a83def0 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:34 +0800 Subject: x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Srivatsa S. Bhat Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1428905519-23704-13-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hpet.h | 7 +- arch/x86/kernel/apic/msi.c | 166 +++++++++++++++++++++++++++++++++++++++----- arch/x86/kernel/hpet.c | 57 ++++----------- 3 files changed, 167 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 36f7125945e3..e87e9faf87a9 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -74,11 +74,16 @@ extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); struct irq_data; +struct hpet_dev; +struct irq_domain; + extern void hpet_msi_unmask(struct irq_data *data); extern void hpet_msi_mask(struct irq_data *data); -struct hpet_dev; extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg); extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg); +extern struct irq_domain *hpet_create_irq_domain(int hpet_id); +extern int hpet_assign_irq(struct irq_domain *domain, + struct hpet_dev *dev, int dev_num); #ifdef CONFIG_PCI_MSI extern int default_setup_hpet_msi(unsigned int irq, unsigned int id); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9be7d6d8a579..10d9ae8f2166 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -51,6 +51,44 @@ void native_compose_msi_msg(struct pci_dev *pdev, MSI_DATA_VECTOR(cfg->vector); } +static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct irq_cfg *cfg = irqd_cfg(data); + + msg->address_hi = MSI_ADDR_BASE_HI; + + if (x2apic_enabled()) + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); + + msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL : + MSI_ADDR_DEST_MODE_LOGICAL) | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU : + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(cfg->dest_apicid); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED : + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); +} + +static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) +{ + struct irq_cfg *cfg = irqd_cfg(irq_data); + + msg->data &= ~MSI_DATA_VECTOR_MASK; + msg->data |= MSI_DATA_VECTOR(cfg->vector); + msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); +} + static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg, u8 hpet_id) { @@ -239,44 +277,43 @@ void dmar_free_hwirq(int irq) * MSI message composition */ #ifdef CONFIG_HPET_TIMER +static inline int hpet_dev_id(struct irq_domain *domain) +{ + return (int)(long)domain->host_data; +} static int hpet_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); + struct irq_data *parent = data->parent_data; struct msi_msg msg; - unsigned int dest; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - hpet_msi_read(data->handler_data, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - hpet_msi_write(data->handler_data, &msg); + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + hpet_msi_read(data->handler_data, &msg); + msi_update_msg(&msg, data); + hpet_msi_write(data->handler_data, &msg); + } - return IRQ_SET_MASK_OK_NOCOPY; + return ret; } -static struct irq_chip hpet_msi_type = { +static struct irq_chip hpet_msi_controller = { .name = "HPET_MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = hpet_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_print_chip = irq_remapping_print_chip, + .irq_compose_msi_msg = irq_msi_compose_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; int default_setup_hpet_msi(unsigned int irq, unsigned int id) { - struct irq_chip *chip = &hpet_msi_type; + struct irq_chip *chip = &hpet_msi_controller; struct msi_msg msg; int ret; @@ -291,4 +328,95 @@ int default_setup_hpet_msi(unsigned int irq, unsigned int id) irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; } + +static int hpet_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + int ret; + + if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_HPET) + return -EINVAL; + if (irq_find_mapping(domain, info->hpet_index)) { + pr_warn("IRQ for HPET%d already exists.\n", info->hpet_index); + return -EEXIST; + } + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret >= 0) { + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_hwirq_and_chip(domain, virq, info->hpet_index, + &hpet_msi_controller, NULL); + irq_set_handler_data(virq, info->hpet_data); + __irq_set_handler(virq, handle_edge_irq, 0, "edge"); + } + + return ret; +} + +static void hpet_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + BUG_ON(nr_irqs > 1); + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static void hpet_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); +} + +static void hpet_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + memset(&msg, 0, sizeof(msg)); + hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); +} + +static struct irq_domain_ops hpet_domain_ops = { + .alloc = hpet_domain_alloc, + .free = hpet_domain_free, + .activate = hpet_domain_activate, + .deactivate = hpet_domain_deactivate, +}; + +struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct irq_domain *parent; + struct irq_alloc_info info; + + if (x86_vector_domain == NULL) + return NULL; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_id = hpet_id; + parent = irq_remapping_get_ir_irq_domain(&info); + if (parent == NULL) + parent = x86_vector_domain; + + return irq_domain_add_hierarchy(parent, 0, 0, NULL, &hpet_domain_ops, + (void *)(long)hpet_id); +} + +int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_data = dev; + info.hpet_id = hpet_dev_id(domain); + info.hpet_index = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, NULL); +} #endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ae29554f57ea..e3bc18080052 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -306,8 +306,6 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static int hpet_setup_msi_irq(unsigned int irq); - static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { @@ -358,7 +356,7 @@ static void hpet_set_mode(enum clock_event_mode mode, hpet_enable_legacy_int(); } else { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - hpet_setup_msi_irq(hdev->irq); + irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); disable_irq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); @@ -424,6 +422,7 @@ static int hpet_legacy_next_event(unsigned long delta, static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); static struct hpet_dev *hpet_devs; +static struct irq_domain *hpet_domain; void hpet_msi_unmask(struct irq_data *data) { @@ -474,32 +473,6 @@ static int hpet_msi_next_event(unsigned long delta, return hpet_next_event(delta, evt, hdev->num); } -static int hpet_setup_msi_irq(unsigned int irq) -{ - if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { - irq_domain_free_irqs(irq, 1); - return -EINVAL; - } - return 0; -} - -static int hpet_assign_irq(struct hpet_dev *dev) -{ - int irq; - - irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); - if (irq <= 0) - return -EINVAL; - - irq_set_handler_data(irq, dev); - - if (hpet_setup_msi_irq(irq)) - return -EINVAL; - - dev->irq = irq; - return 0; -} - static irqreturn_t hpet_interrupt_handler(int irq, void *data) { struct hpet_dev *dev = (struct hpet_dev *)data; @@ -542,9 +515,6 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) if (!(hdev->flags & HPET_DEV_VALID)) return; - if (hpet_setup_msi_irq(hdev->irq)) - return; - hdev->cpu = cpu; per_cpu(cpu_hpet_dev, cpu) = hdev; evt->name = hdev->name; @@ -576,7 +546,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int id; unsigned int num_timers; unsigned int num_timers_used = 0; - int i; + int i, irq; if (hpet_msi_disable) return; @@ -589,6 +559,10 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) num_timers++; /* Value read out starts from 0 */ hpet_print_config(); + hpet_domain = hpet_create_irq_domain(hpet_blockid); + if (!hpet_domain) + return; + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); if (!hpet_devs) return; @@ -603,15 +577,16 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) if (!(cfg & HPET_TN_FSB_CAP)) continue; + irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); + if (irq < 0) + continue; + + sprintf(hdev->name, "hpet%d", i); + hdev->num = i; + hdev->irq = irq; hdev->flags = 0; if (cfg & HPET_TN_PERIODIC_CAP) hdev->flags |= HPET_DEV_PERI_CAP; - hdev->num = i; - - sprintf(hdev->name, "hpet%d", i); - if (hpet_assign_irq(hdev)) - continue; - hdev->flags |= HPET_DEV_FSB_CAP; hdev->flags |= HPET_DEV_VALID; num_timers_used++; @@ -711,10 +686,6 @@ static int hpet_cpuhp_notify(struct notifier_block *n, } #else -static int hpet_setup_msi_irq(unsigned int irq) -{ - return 0; -} static void hpet_msi_capability_lookup(unsigned int start_timer) { return; -- cgit v1.2.1 From 52f518a3a7c2f80551a38d38be28bc9f335e713c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:35 +0800 Subject: x86/MSI: Use hierarchical irqdomains to manage MSI interrupts Enhance MSI code to support hierarchical irqdomains, it helps to make the architecture more clear. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Joerg Roedel Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-14-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 + arch/x86/include/asm/hw_irq.h | 9 ++- arch/x86/include/asm/irq_remapping.h | 6 +- arch/x86/include/asm/msi.h | 7 ++ arch/x86/kernel/apic/msi.c | 141 +++++++++++++++++++---------------- arch/x86/kernel/apic/vector.c | 2 + 6 files changed, 94 insertions(+), 72 deletions(-) create mode 100644 arch/x86/include/asm/msi.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e75a96c38c58..26e066579637 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -914,6 +914,7 @@ config X86_LOCAL_APIC depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ select IRQ_DOMAIN_HIERARCHY + select PCI_MSI_IRQ_DOMAIN if PCI_MSI config X86_IO_APIC def_bool y diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 75a97a5bbfa8..05829e973a2a 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -110,9 +110,10 @@ struct irq_2_irte { }; #endif /* CONFIG_IRQ_REMAP */ +struct irq_domain; + #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; -struct irq_domain; struct pci_dev; struct msi_desc; @@ -214,6 +215,12 @@ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} #endif /* CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_PCI_MSI +extern void arch_init_msi_domain(struct irq_domain *domain); +#else +static inline void arch_init_msi_domain(struct irq_domain *domain) { } +#endif + /* Statistics */ extern atomic_t irq_err_count; extern atomic_t irq_mis_count; diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 0cd6195cc375..0d3bbd0e2c42 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -66,11 +66,7 @@ irq_remapping_get_irq_domain(struct irq_alloc_info *info); extern void irq_remapping_print_chip(struct irq_data *data, struct seq_file *p); /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ -static inline struct irq_domain * -arch_create_msi_irq_domain(struct irq_domain *parent) -{ - return NULL; -} +extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent); /* Get parent irqdomain for interrupt remapping irqdomain */ static inline struct irq_domain *arch_get_ir_parent_domain(void) diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h new file mode 100644 index 000000000000..93724cc62177 --- /dev/null +++ b/arch/x86/include/asm/msi.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_MSI_H +#define _ASM_X86_MSI_H +#include + +typedef struct irq_alloc_info msi_alloc_info_t; + +#endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 10d9ae8f2166..c426cd58844e 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu + * Convert to hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,6 +23,8 @@ #include #include +static struct irq_domain *msi_default_domain; + void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, unsigned int dest, struct msi_msg *msg, u8 hpet_id) @@ -114,102 +118,107 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, return 0; } -static int -msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) -{ - struct irq_cfg *cfg = irqd_cfg(data); - struct msi_msg msg; - unsigned int dest; - int ret; - - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - __get_cached_msi_msg(data->msi_desc, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - __pci_write_msi_msg(data->msi_desc, &msg); - - return IRQ_SET_MASK_OK_NOCOPY; -} - /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. */ -static struct irq_chip msi_chip = { +static struct irq_chip pci_msi_controller = { .name = "PCI-MSI", .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, - .irq_ack = apic_ack_edge, - .irq_set_affinity = msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_print_chip = irq_remapping_print_chip, + .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset) +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - struct irq_chip *chip = &msi_chip; - struct msi_msg msg; - unsigned int irq = irq_base + irq_offset; - int ret; + struct irq_domain *domain; + struct irq_alloc_info info; - ret = msi_compose_msg(dev, irq, &msg, -1); - if (ret < 0) - return ret; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_MSI; + info.msi_dev = dev; - irq_set_msi_desc_off(irq_base, irq_offset, msidesc); + domain = irq_remapping_get_irq_domain(&info); + if (domain == NULL) + domain = msi_default_domain; + if (domain == NULL) + return -ENOSYS; - /* - * MSI-X message is written per-IRQ, the offset is always 0. - * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. - */ - if (!irq_offset) - pci_write_msi_msg(irq, &msg); + return pci_msi_domain_alloc_irqs(domain, dev, nvec, type); +} - setup_remapped_irq(irq, irq_cfg(irq), chip); +void native_teardown_msi_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); +} - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); +static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->msi_hwirq; +} - dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq); +static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct msi_desc *desc = first_pci_msi_entry(pdev); + + init_irq_alloc_info(arg, NULL); + arg->msi_dev = pdev; + if (desc->msi_attrib.is_msix) { + arg->type = X86_IRQ_ALLOC_TYPE_MSIX; + } else { + arg->type = X86_IRQ_ALLOC_TYPE_MSI; + arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + } return 0; } -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { - struct msi_desc *msidesc; - int irq, ret; + arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); +} - /* Multiple MSI vectors only supported with interrupt remapping */ - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; +static struct msi_domain_ops pci_msi_domain_ops = { + .get_hwirq = pci_msi_get_hwirq, + .msi_prepare = pci_msi_prepare, + .set_desc = pci_msi_set_desc, +}; - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); - if (irq <= 0) - return -ENOSPC; +static struct msi_domain_info pci_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; - ret = setup_msi_irq(dev, msidesc, irq, 0); - if (ret < 0) { - irq_domain_free_irqs(irq, 1); - return ret; - } +void arch_init_msi_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; - } - return 0; + msi_default_domain = pci_msi_create_irq_domain(NULL, + &pci_msi_domain_info, parent); + if (!msi_default_domain) + pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); } -void native_teardown_msi_irq(unsigned int irq) +#ifdef CONFIG_IRQ_REMAP +struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) { - irq_domain_free_irqs(irq, 1); + return msi_create_irq_domain(NULL, &pci_msi_domain_info, parent); } +#endif #ifdef CONFIG_DMAR_TABLE static int diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6358d8d351f5..a8d82896be75 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -364,6 +364,8 @@ int __init arch_early_irq_init(void) BUG_ON(x86_vector_domain == NULL); irq_set_default_host(x86_vector_domain); + arch_init_msi_domain(x86_vector_domain); + return arch_early_ioapic_init(); } -- cgit v1.2.1 From 80aa283364a17998dceb577bd185e3380b927544 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:36 +0800 Subject: x86/irq: Directly call native_compose_msi_msg() for DMAR IRQ DMAR interrupt won't be remapped by interrupt remapping hardware, so directly call native_compose_msi_msg() for DMAR IRQ to compose MSI message data. This will help to simplify MSI code later. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-15-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index c426cd58844e..9adb87100ffe 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -259,12 +259,10 @@ static struct irq_chip dmar_msi_type = { int arch_setup_dmar_msi(unsigned int irq) { - int ret; struct msi_msg msg; + struct irq_cfg *cfg = irq_cfg(irq); - ret = msi_compose_msg(NULL, irq, &msg, -1); - if (ret < 0) - return ret; + native_compose_msi_msg(NULL, irq, cfg->dest_apicid, &msg, -1); dmar_msi_write(irq, &msg); irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, "edge"); -- cgit v1.2.1 From 7a53a12162cbe5feb66380b96cc794a031a8f39a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:39 +0800 Subject: irq_remapping: Clean up unused MSI related code Now MSI interrupt has been converted to new hierarchical irqdomain interfaces, so remove legacy MSI related code and interfaces. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Rafael J. Wysocki Cc: Joerg Roedel Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Link: http://lkml.kernel.org/r/1428905519-23704-18-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 13 ------------- arch/x86/include/asm/pci.h | 5 ----- arch/x86/kernel/x86_init.c | 2 -- 3 files changed, 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 0d3bbd0e2c42..b978c68f5d29 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -48,10 +48,6 @@ extern int setup_ioapic_remapped_entry(int irq, int vector, struct io_apic_irq_attr *attr); extern void free_remapped_irq(int irq); -extern void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id); -extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); extern void panic_if_irq_remap(const char *msg); extern bool setup_remapped_irq(int irq, struct irq_cfg *cfg, @@ -91,15 +87,6 @@ static inline int setup_ioapic_remapped_entry(int irq, return -ENODEV; } static inline void free_remapped_irq(int irq) { } -static inline void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ -} -static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) -{ - return -ENODEV; -} static inline void panic_if_irq_remap(const char *msg) { diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 4e370a5d8117..d8c80ff32e8c 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -96,15 +96,10 @@ extern void pci_iommu_alloc(void); #ifdef CONFIG_PCI_MSI /* implemented in arch/x86/kernel/apic/io_apic. */ struct msi_desc; -void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, u8 hpet_id); int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void native_teardown_msi_irq(unsigned int irq); void native_restore_msi_irqs(struct pci_dev *dev); -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset); #else -#define native_compose_msi_msg NULL #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #endif diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 234b0722de53..b094d691f2fe 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -111,11 +111,9 @@ EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi = { .setup_msi_irqs = native_setup_msi_irqs, - .compose_msi_msg = native_compose_msi_msg, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, - .setup_hpet_msi = default_setup_hpet_msi, }; /* MSI arch specific hooks */ -- cgit v1.2.1 From b1855c752e67d1125d41fadb499014b49a245db8 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:40 +0800 Subject: x86/MSI: Clean up unused MSI related code and interfaces Now MSI interrupt has been converted to new hierarchical irqdomain interfaces, so remove legacy MSI related code and interfaces. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Yijing Wang Link: http://lkml.kernel.org/r/1428905519-23704-19-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hpet.h | 9 ------- arch/x86/include/asm/x86_init.h | 4 --- arch/x86/kernel/apic/msi.c | 55 +++-------------------------------------- 3 files changed, 4 insertions(+), 64 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index e87e9faf87a9..5fa9fb0f8809 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -85,15 +85,6 @@ extern struct irq_domain *hpet_create_irq_domain(int hpet_id); extern int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, int dev_num); -#ifdef CONFIG_PCI_MSI -extern int default_setup_hpet_msi(unsigned int irq, unsigned int id); -#else -static inline int default_setup_hpet_msi(unsigned int irq, unsigned int id) -{ - return -EINVAL; -} -#endif - #ifdef CONFIG_HPET_EMULATE_RTC #include diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f58a9c7a3c86..1649bb9ca27c 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -175,13 +175,9 @@ struct msi_msg; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); - void (*compose_msi_msg)(struct pci_dev *dev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, - u8 hpet_id); void (*teardown_msi_irq)(unsigned int irq); void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); - int (*setup_hpet_msi)(unsigned int irq, unsigned int id); }; struct IO_APIC_route_entry; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9adb87100ffe..9fe7a08479fa 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -25,16 +25,12 @@ static struct irq_domain *msi_default_domain; -void native_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) +static void native_compose_msi_msg(struct irq_cfg *cfg, struct msi_msg *msg) { - struct irq_cfg *cfg = irq_cfg(irq); - msg->address_hi = MSI_ADDR_BASE_HI; if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest); + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); msg->address_lo = MSI_ADDR_BASE_LO | @@ -44,7 +40,7 @@ void native_compose_msi_msg(struct pci_dev *pdev, ((apic->irq_delivery_mode != dest_LowestPrio) ? MSI_ADDR_REDIRECTION_CPU : MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | @@ -93,31 +89,6 @@ static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); } -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, - struct msi_msg *msg, u8 hpet_id) -{ - struct irq_cfg *cfg; - int err; - unsigned dest; - - if (disable_apic) - return -ENXIO; - - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id); - - return 0; -} - /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. @@ -262,7 +233,7 @@ int arch_setup_dmar_msi(unsigned int irq) struct msi_msg msg; struct irq_cfg *cfg = irq_cfg(irq); - native_compose_msi_msg(NULL, irq, cfg->dest_apicid, &msg, -1); + native_compose_msi_msg(cfg, &msg); dmar_msi_write(irq, &msg); irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, "edge"); @@ -318,24 +289,6 @@ static struct irq_chip hpet_msi_controller = { .flags = IRQCHIP_SKIP_SET_WAKE, }; -int default_setup_hpet_msi(unsigned int irq, unsigned int id) -{ - struct irq_chip *chip = &hpet_msi_controller; - struct msi_msg msg; - int ret; - - ret = msi_compose_msg(NULL, irq, &msg, id); - if (ret < 0) - return ret; - - hpet_msi_write(irq_get_handler_data(irq), &msg); - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - setup_remapped_irq(irq, irq_cfg(irq), chip); - - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); - return 0; -} - static int hpet_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { -- cgit v1.2.1 From 34742db8eaf9ff364034f214ee5827701e131d4b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:41 +0800 Subject: iommu/vt-d: Refine the interfaces to create IRQ for DMAR unit Refine the interfaces to create IRQ for DMAR unit. It's a preparation for converting DMAR IRQ to hierarchical irqdomain on x86. It also moves dmar_alloc_hwirq()/dmar_free_hwirq() from irq_remapping.h to dmar.h. They are not irq_remapping specific. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Vinod Koul Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Tony Luck Cc: Fenghua Yu Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-20-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 4 ---- arch/x86/kernel/apic/msi.c | 24 +++++++++++++----------- 2 files changed, 13 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index b978c68f5d29..bacac1052262 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -117,8 +117,4 @@ irq_remapping_get_irq_domain(struct irq_alloc_info *info) #define irq_remapping_print_chip NULL #endif /* CONFIG_IRQ_REMAP */ - -extern int dmar_alloc_hwirq(void); -extern void dmar_free_hwirq(int irq); - #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9fe7a08479fa..ca6250439acc 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -228,25 +228,27 @@ static struct irq_chip dmar_msi_type = { .flags = IRQCHIP_SKIP_SET_WAKE, }; -int arch_setup_dmar_msi(unsigned int irq) +int dmar_alloc_hwirq(int id, int node, void *arg) { + int irq; struct msi_msg msg; - struct irq_cfg *cfg = irq_cfg(irq); - native_compose_msi_msg(cfg, &msg); - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; -} + irq = irq_domain_alloc_irqs(NULL, 1, node, NULL); + if (irq > 0) { + irq_set_handler_data(irq, arg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, + handle_edge_irq, "edge"); + native_compose_msi_msg(irq_cfg(irq), &msg); + dmar_msi_write(irq, &msg); + } -int dmar_alloc_hwirq(void) -{ - return irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); + return irq; } void dmar_free_hwirq(int irq) { + irq_set_handler_data(irq, NULL); + irq_set_handler(irq, NULL); irq_domain_free_irqs(irq, 1); } #endif -- cgit v1.2.1 From 0921f1da6425f05a1f56803069124b7ec13b79e2 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:42 +0800 Subject: x86/irq: Use hierarchical irqdomain to manage DMAR interrupts Enhance DMAR code to support hierarchical irqdomain, it helps to make the architecture more clear. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-21-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 7 ++ arch/x86/kernel/apic/msi.c | 153 ++++++++++++++++++++++++++---------------- 2 files changed, 103 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 05829e973a2a..bf1725047c27 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -122,6 +122,7 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_HPET, X86_IRQ_ALLOC_TYPE_MSI, X86_IRQ_ALLOC_TYPE_MSIX, + X86_IRQ_ALLOC_TYPE_DMAR, }; struct irq_alloc_info { @@ -153,6 +154,12 @@ struct irq_alloc_info { u32 ioapic_valid : 1; struct IO_APIC_route_entry *ioapic_entry; }; +#endif +#ifdef CONFIG_DMAR_TABLE + struct { + int dmar_id; + void *dmar_data; + }; #endif }; }; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ca6250439acc..f23d17d759b6 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -25,32 +25,6 @@ static struct irq_domain *msi_default_domain; -static void native_compose_msi_msg(struct irq_cfg *cfg, struct msi_msg *msg) -{ - msg->address_hi = MSI_ADDR_BASE_HI; - - if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); - - msg->address_lo = - MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL : - MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU : - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(cfg->dest_apicid); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED : - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); -} - static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { struct irq_cfg *cfg = irqd_cfg(data); @@ -87,6 +61,9 @@ static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) msg->data |= MSI_DATA_VECTOR(cfg->vector); msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); } /* @@ -196,59 +173,121 @@ static int dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest, irq = data->irq; + struct irq_data *parent = data->parent_data; struct msi_msg msg; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - dmar_msi_read(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); - - dmar_msi_write(irq, &msg); + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + dmar_msi_read(data->irq, &msg); + msi_update_msg(&msg, data); + dmar_msi_write(data->irq, &msg); + } - return IRQ_SET_MASK_OK_NOCOPY; + return ret; } -static struct irq_chip dmar_msi_type = { +static struct irq_chip dmar_msi_controller = { .name = "DMAR_MSI", .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = dmar_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int dmar_alloc_hwirq(int id, int node, void *arg) +static int dmar_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + int ret; + + if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_DMAR) + return -EINVAL; + if (irq_find_mapping(domain, info->dmar_id)) { + pr_warn("IRQ for DMAR%d already exists.\n", info->dmar_id); + return -EEXIST; + } + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret >= 0) { + irq_domain_set_hwirq_and_chip(domain, virq, info->dmar_id, + &dmar_msi_controller, NULL); + irq_set_handler_data(virq, info->dmar_data); + __irq_set_handler(virq, handle_edge_irq, 0, "edge"); + } + + return ret; +} + +static void dmar_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + BUG_ON(nr_irqs > 1); + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static void dmar_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - int irq; struct msi_msg msg; - irq = irq_domain_alloc_irqs(NULL, 1, node, NULL); - if (irq > 0) { - irq_set_handler_data(irq, arg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, - handle_edge_irq, "edge"); - native_compose_msi_msg(irq_cfg(irq), &msg); - dmar_msi_write(irq, &msg); + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + dmar_msi_write(irq_data->irq, &msg); +} + +static void dmar_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + memset(&msg, 0, sizeof(msg)); + dmar_msi_write(irq_data->irq, &msg); +} + +static struct irq_domain_ops dmar_domain_ops = { + .alloc = dmar_domain_alloc, + .free = dmar_domain_free, + .activate = dmar_domain_activate, + .deactivate = dmar_domain_deactivate, +}; + +static struct irq_domain *dmar_get_irq_domain(void) +{ + static struct irq_domain *dmar_domain; + static DEFINE_MUTEX(dmar_lock); + + mutex_lock(&dmar_lock); + if (dmar_domain == NULL) { + dmar_domain = irq_domain_add_tree(NULL, &dmar_domain_ops, NULL); + if (dmar_domain) + dmar_domain->parent = x86_vector_domain; } + mutex_unlock(&dmar_lock); + + return dmar_domain; +} + +int dmar_alloc_hwirq(int id, int node, void *arg) +{ + struct irq_domain *domain = dmar_get_irq_domain(); + struct irq_alloc_info info; + + if (!domain) + return -1; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_DMAR; + info.dmar_id = id; + info.dmar_data = arg; - return irq; + return irq_domain_alloc_irqs(domain, 1, node, &info); } void dmar_free_hwirq(int irq) { - irq_set_handler_data(irq, NULL); - irq_set_handler(irq, NULL); irq_domain_free_irqs(irq, 1); } #endif -- cgit v1.2.1 From 49e07d8f28c05347f237146a9ec66f6d958db83e Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:43 +0800 Subject: x86/htirq: Use hierarchical irqdomain to manage Hypertransport interrupts We have slightly changed the architecture interfaces to support htirq PCI driver. It's safe because currently Hypertransport interrupt is only enabled on x86 platforms. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-22-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 13 ++++ arch/x86/kernel/apic/htirq.c | 161 +++++++++++++++++++++++++++++++----------- arch/x86/kernel/apic/vector.c | 1 + 3 files changed, 134 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index bf1725047c27..5e0b031d1a7d 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -160,6 +160,14 @@ struct irq_alloc_info { int dmar_id; void *dmar_data; }; +#endif +#ifdef CONFIG_HT_IRQ + struct { + int ht_pos; + int ht_idx; + struct pci_dev *ht_dev; + void *ht_update; + }; #endif }; }; @@ -227,6 +235,11 @@ extern void arch_init_msi_domain(struct irq_domain *domain); #else static inline void arch_init_msi_domain(struct irq_domain *domain) { } #endif +#ifdef CONFIG_HT_IRQ +extern void arch_init_htirq_domain(struct irq_domain *domain); +#else +static inline void arch_init_htirq_domain(struct irq_domain *domain) { } +#endif /* Statistics */ extern atomic_t irq_err_count; diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index b307ee7a7148..1cae104415ea 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu + * Add support of hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,70 +21,104 @@ #include #include +static struct irq_domain *htirq_domain; + /* * Hypertransport interrupt support */ -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - static int ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest; + struct irq_data *parent = data->parent_data; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - target_ht_irq(data->irq, dest, cfg->vector); - return IRQ_SET_MASK_OK_NOCOPY; + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(data); + + fetch_ht_irq_msg(data->irq, &msg); + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | + HT_IRQ_LOW_DEST_ID_MASK); + msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); + write_ht_irq_msg(data->irq, &msg); + } + + return ret; } static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .irq_mask = mask_ht_irq, .irq_unmask = unmask_ht_irq, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = ht_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int arch_alloc_ht_irq(struct pci_dev *dev) +static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - return irq_domain_alloc_irqs(NULL, 1, dev_to_node(&dev->dev), NULL); + struct ht_irq_cfg *ht_cfg; + struct irq_alloc_info *info = arg; + struct pci_dev *dev; + irq_hw_number_t hwirq; + int ret; + + if (nr_irqs > 1 || !info) + return -EINVAL; + + dev = info->ht_dev; + hwirq = (info->ht_idx & 0xFF) | + PCI_DEVID(dev->bus->number, dev->devfn) << 8 | + (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24; + if (irq_find_mapping(domain, hwirq) > 0) + return -EEXIST; + + ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL); + if (!ht_cfg) + return -ENOMEM; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(ht_cfg); + return ret; + } + + /* Initialize msg to a value that will never match the first write. */ + ht_cfg->msg.address_lo = 0xffffffff; + ht_cfg->msg.address_hi = 0xffffffff; + ht_cfg->dev = info->ht_dev; + ht_cfg->update = info->ht_update; + ht_cfg->pos = info->ht_pos; + ht_cfg->idx = 0x10 + (info->ht_idx * 2); + irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg, + handle_edge_irq, ht_cfg, "edge"); + + return 0; } -void arch_free_ht_irq(int irq) +static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - irq_domain_free_irqs(irq, 1); + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + BUG_ON(nr_irqs != 1); + kfree(irq_data->chip_data); + irq_domain_free_irqs_top(domain, virq, nr_irqs); } -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +static void htirq_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - struct irq_cfg *cfg; struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(irq_data); - if (disable_apic) - return -ENXIO; - - cfg = irq_cfg(irq); msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); - msg.address_lo = HT_IRQ_LOW_BASE | HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | @@ -95,13 +131,56 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) HT_IRQ_LOW_MT_FIXED : HT_IRQ_LOW_MT_ARBITRATED) | HT_IRQ_LOW_IRQ_MASKED; + write_ht_irq_msg(irq_data->irq, &msg); +} - write_ht_irq_msg(irq, &msg); +static void htirq_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct ht_irq_msg msg; - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); + memset(&msg, 0, sizeof(msg)); + write_ht_irq_msg(irq_data->irq, &msg); +} - dev_dbg(&dev->dev, "irq %d for HT\n", irq); +static struct irq_domain_ops htirq_domain_ops = { + .alloc = htirq_domain_alloc, + .free = htirq_domain_free, + .activate = htirq_domain_activate, + .deactivate = htirq_domain_deactivate, +}; - return 0; +void arch_init_htirq_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; + + htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL); + if (!htirq_domain) + pr_warn("failed to initialize irqdomain for HTIRQ.\n"); + else + htirq_domain->parent = parent; +} + +int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, + ht_irq_update_t *update) +{ + struct irq_alloc_info info; + + if (!htirq_domain) + return -ENOSYS; + + init_irq_alloc_info(&info, NULL); + info.ht_idx = idx; + info.ht_pos = pos; + info.ht_dev = dev; + info.ht_update = update; + + return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev), + &info); +} + +void arch_teardown_ht_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a8d82896be75..b4b6b5a13440 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -365,6 +365,7 @@ int __init arch_early_irq_init(void) irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); + arch_init_htirq_domain(x86_vector_domain); return arch_early_ioapic_init(); } -- cgit v1.2.1 From 43fe1abc18a237581663a51da4c2f8e57684c223 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:44 +0800 Subject: x86/uv: Use hierarchical irqdomain to manage UV interrupts Enhance UV code to support hierarchical irqdomain, it helps to make the architecture more clear. We construct hwirq based on mmr_blade and mmr_offset, but mmr_offset has type unsigned long, it may exceed the range of irq_hw_number_t. So help about the way to construct hwirq based on mmr_blade and mmr_offset is welcomed! Folded a patch from Dimitri Sivanich to fix a bug on UV platforms, please refer to: http://lkml.org/lkml/2014/12/16/351 Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Russ Anderson Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-23-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 9 ++ arch/x86/platform/uv/uv_irq.c | 290 ++++++++++++++++-------------------------- 2 files changed, 119 insertions(+), 180 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 5e0b031d1a7d..75d0db1db8a0 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -123,6 +123,7 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_MSI, X86_IRQ_ALLOC_TYPE_MSIX, X86_IRQ_ALLOC_TYPE_DMAR, + X86_IRQ_ALLOC_TYPE_UV, }; struct irq_alloc_info { @@ -168,6 +169,14 @@ struct irq_alloc_info { struct pci_dev *ht_dev; void *ht_update; }; +#endif +#ifdef CONFIG_X86_UV + struct { + int uv_limit; + int uv_blade; + unsigned long uv_offset; + char *uv_name; + }; #endif }; }; diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 474912d03f40..54af6e388a12 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -19,17 +19,31 @@ #include /* MMR offset and pnode of hub sourcing interrupts for a given irq */ -struct uv_irq_2_mmr_pnode{ - struct rb_node list; +struct uv_irq_2_mmr_pnode { unsigned long offset; int pnode; - int irq; }; -static DEFINE_SPINLOCK(uv_irq_lock); -static struct rb_root uv_irq_root; +static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = cfg->dest_apicid; -static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); + uv_write_global_mmr64(info->pnode, info->offset, mmr_value); +} static void uv_noop(struct irq_data *data) { } @@ -38,6 +52,24 @@ static void uv_ack_apic(struct irq_data *data) ack_APIC_irq(); } +static int +uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_data *parent = data->parent_data; + struct irq_cfg *cfg = irqd_cfg(data); + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + uv_program_mmr(cfg, data->chip_data); + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + } + + return ret; +} + static struct irq_chip uv_irq_chip = { .name = "UV-CORE", .irq_mask = uv_noop, @@ -46,179 +78,99 @@ static struct irq_chip uv_irq_chip = { .irq_set_affinity = uv_set_irq_affinity, }; -/* - * Add offset and pnode information of the hub sourcing interrupts to the - * rb tree for a specific irq. - */ -static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) +static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - struct rb_node **link = &uv_irq_root.rb_node; - struct rb_node *parent = NULL; - struct uv_irq_2_mmr_pnode *n; - struct uv_irq_2_mmr_pnode *e; - unsigned long irqflags; - - n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, - uv_blade_to_memory_nid(blade)); - if (!n) + struct uv_irq_2_mmr_pnode *chip_data; + struct irq_alloc_info *info = arg; + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + int ret; + + if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_UV) + return -EINVAL; + + chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL, + irq_data->node); + if (!chip_data) return -ENOMEM; - n->irq = irq; - n->offset = offset; - n->pnode = uv_blade_to_pnode(blade); - spin_lock_irqsave(&uv_irq_lock, irqflags); - /* Find the right place in the rbtree: */ - while (*link) { - parent = *link; - e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); - - if (unlikely(irq == e->irq)) { - /* irq entry exists */ - e->pnode = uv_blade_to_pnode(blade); - e->offset = offset; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - kfree(n); - return 0; - } - - if (irq < e->irq) - link = &(*link)->rb_left; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret >= 0) { + if (info->uv_limit == UV_AFFINITY_CPU) + irq_set_status_flags(virq, IRQ_NO_BALANCING); else - link = &(*link)->rb_right; + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + + chip_data->pnode = uv_blade_to_pnode(info->uv_blade); + chip_data->offset = info->uv_offset; + irq_domain_set_info(domain, virq, virq, &uv_irq_chip, chip_data, + handle_percpu_irq, NULL, info->uv_name); + } else { + kfree(chip_data); } - /* Insert the node into the rbtree. */ - rb_link_node(&n->list, parent, link); - rb_insert_color(&n->list, &uv_irq_root); - - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; + return ret; } -/* Retrieve offset and pnode information from the rb tree for a specific irq */ -int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) +static void uv_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - - if (e->irq == irq) { - *offset = e->offset; - *pnode = e->pnode; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; - } - - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return -1; + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + BUG_ON(nr_irqs != 1); + kfree(irq_data->chip_data); + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_clear_status_flags(virq, IRQ_NO_BALANCING); + irq_domain_free_irqs_top(domain, virq, nr_irqs); } /* * Re-target the irq to the specified CPU and enable the specified MMR located * on the specified blade to allow the sending of MSIs to the specified CPU. */ -static int -arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset, int limit) +static void uv_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - struct irq_cfg *cfg = irq_cfg(irq); - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - - if (limit == UV_AFFINITY_CPU) - irq_set_status_flags(irq, IRQ_NO_BALANCING); - else - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - - irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = cfg->dest_apicid; - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; + uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); } /* * Disable the specified MMR located on the specified blade so that MSIs are * longer allowed to be sent. */ -static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) +static void uv_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) { unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->mask = 1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); } -static int -uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest; - unsigned long mmr_value, mmr_offset; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - if (apic_set_affinity(data, mask, &dest)) - return -1; - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - /* Get previously stored MMR and pnode of hub sourcing interrupts */ - if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode)) - return -1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +static struct irq_domain_ops uv_domain_ops = { + .alloc = uv_domain_alloc, + .free = uv_domain_free, + .activate = uv_domain_activate, + .deactivate = uv_domain_deactivate, +}; - if (cfg->move_in_progress) - send_cleanup_vector(cfg); +static struct irq_domain *uv_get_irq_domain(void) +{ + static struct irq_domain *uv_domain; + static DEFINE_MUTEX(uv_lock); + + mutex_lock(&uv_lock); + if (uv_domain == NULL) { + uv_domain = irq_domain_add_tree(NULL, &uv_domain_ops, NULL); + if (uv_domain) + uv_domain->parent = x86_vector_domain; + } + mutex_unlock(&uv_lock); - return IRQ_SET_MASK_OK_NOCOPY; + return uv_domain; } /* @@ -229,23 +181,21 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { - int ret, irq; struct irq_alloc_info info; + struct irq_domain *domain = uv_get_irq_domain(); - init_irq_alloc_info(&info, cpumask_of(cpu)); - irq = irq_domain_alloc_irqs(NULL, 1, uv_blade_to_memory_nid(mmr_blade), - &info); - if (irq <= 0) - return -EBUSY; - - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, - limit); - if (ret == irq) - uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); - else - irq_domain_free_irqs(irq, 1); + if (!domain) + return -ENOMEM; - return ret; + init_irq_alloc_info(&info, cpumask_of(cpu)); + info.type = X86_IRQ_ALLOC_TYPE_UV; + info.uv_limit = limit; + info.uv_blade = mmr_blade; + info.uv_offset = mmr_offset; + info.uv_name = irq_name; + + return irq_domain_alloc_irqs(domain, 1, + uv_blade_to_memory_nid(mmr_blade), &info); } EXPORT_SYMBOL_GPL(uv_setup_irq); @@ -258,26 +208,6 @@ EXPORT_SYMBOL_GPL(uv_setup_irq); */ void uv_teardown_irq(unsigned int irq) { - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - if (e->irq == irq) { - arch_disable_uv_irq(e->pnode, e->offset); - rb_erase(n, &uv_irq_root); - kfree(e); - break; - } - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); irq_domain_free_irqs(irq, 1); } EXPORT_SYMBOL_GPL(uv_teardown_irq); -- cgit v1.2.1 From 81dabe2e739d5e0ad8ca2369738fb84bd64f967d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:45 +0800 Subject: x86/irq: Normalize x86 irq_chip name Some irq_chip names use underscore, others use hyphen. So normalize them to use hyphen as separator. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-24-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index f23d17d759b6..d17eb6a52c84 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -188,7 +188,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, } static struct irq_chip dmar_msi_controller = { - .name = "DMAR_MSI", + .name = "DMAR-MSI", .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, .irq_ack = irq_chip_ack_parent, @@ -319,7 +319,7 @@ static int hpet_msi_set_affinity(struct irq_data *data, } static struct irq_chip hpet_msi_controller = { - .name = "HPET_MSI", + .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, .irq_ack = irq_chip_ack_parent, -- cgit v1.2.1 From 68682a2687bf7dbe51309d297757a7ea6a96d312 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:46 +0800 Subject: x86/MSI: Simplify the way to deal with remapped MSI interrupts Simplify the way to deal with remapped MSI interrupts, so we can remove irq_chip.irq_print_chip later. We simply change the name when the setup detects that the parent domain is remapping. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-25-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index d17eb6a52c84..87df03ae99ba 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -77,7 +77,6 @@ static struct irq_chip pci_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_print_chip = irq_remapping_print_chip, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, @@ -143,7 +142,7 @@ static struct msi_domain_ops pci_msi_domain_ops = { static struct msi_domain_info pci_msi_domain_info = { .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + MSI_FLAG_PCI_MSIX, .ops = &pci_msi_domain_ops, .chip = &pci_msi_controller, .handler = handle_edge_irq, @@ -162,9 +161,29 @@ void arch_init_msi_domain(struct irq_domain *parent) } #ifdef CONFIG_IRQ_REMAP +static struct irq_chip pci_msi_ir_controller = { + .name = "IR-PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = pci_msi_domain_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static struct msi_domain_info pci_msi_ir_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_ir_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; + struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) { - return msi_create_irq_domain(NULL, &pci_msi_domain_info, parent); + return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent); } #endif @@ -325,7 +344,6 @@ static struct irq_chip hpet_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = hpet_msi_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_print_chip = irq_remapping_print_chip, .irq_compose_msi_msg = irq_msi_compose_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -402,6 +420,8 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) parent = irq_remapping_get_ir_irq_domain(&info); if (parent == NULL) parent = x86_vector_domain; + else + hpet_msi_controller.name = "IR-HPET-MSI"; return irq_domain_add_hierarchy(parent, 0, 0, NULL, &hpet_domain_ops, (void *)(long)hpet_id); -- cgit v1.2.1 From 90d84fe95dd6b418383aa0e0e5cace8f1b1e7e30 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:47 +0800 Subject: x86/MSI: Replace msi_update_msg() with irq_chip_compose_msi_msg() Function irq_chip_compose_msi_msg() can achieve the same goal as msi_update_msg(), so remove msi_update_msg(). Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-26-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 87df03ae99ba..5b5ef5bd23f5 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -53,19 +53,6 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) MSI_DATA_VECTOR(cfg->vector); } -static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) -{ - struct irq_cfg *cfg = irqd_cfg(irq_data); - - msg->data &= ~MSI_DATA_VECTOR_MASK; - msg->data |= MSI_DATA_VECTOR(cfg->vector); - msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); - if (x2apic_enabled()) - msg->address_hi = MSI_ADDR_BASE_HI | - MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); -} - /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. @@ -198,8 +185,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0) { - dmar_msi_read(data->irq, &msg); - msi_update_msg(&msg, data); + irq_chip_compose_msi_msg(data, &msg); dmar_msi_write(data->irq, &msg); } @@ -329,8 +315,7 @@ static int hpet_msi_set_affinity(struct irq_data *data, ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { - hpet_msi_read(data->handler_data, &msg); - msi_update_msg(&msg, data); + irq_chip_compose_msi_msg(data, &msg); hpet_msi_write(data->handler_data, &msg); } -- cgit v1.2.1 From 62ac1780830ed64a9a46f80a03e91de71957d670 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:48 +0800 Subject: x86/irq: Implement irq_chip.irq_write_msi_msg for MSI/DMAR/HPET irq_chips Implement irq_chip.irq_write_msi_msg for MSI/DMAR/HPET irq_chips, they will be used to replace duplicated code. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-27-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 5b5ef5bd23f5..3c825867aeb5 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -192,6 +192,11 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } +static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + dmar_msi_write(data->irq, msg); +} + static struct irq_chip dmar_msi_controller = { .name = "DMAR-MSI", .irq_unmask = dmar_msi_unmask, @@ -200,6 +205,7 @@ static struct irq_chip dmar_msi_controller = { .irq_set_affinity = dmar_msi_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -322,6 +328,11 @@ static int hpet_msi_set_affinity(struct irq_data *data, return ret; } +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(data->handler_data, msg); +} + static struct irq_chip hpet_msi_controller = { .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, @@ -330,6 +341,7 @@ static struct irq_chip hpet_msi_controller = { .irq_set_affinity = hpet_msi_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = hpet_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -- cgit v1.2.1 From e390d895ae14ad655c6b830e62a22a81b69290ef Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:49 +0800 Subject: x86/irq: Simplify MSI/DMAR/HPET implementation by using common code Use common MSI interfaces instead of private implementations of the same functionality to simplify DMAR/HPET driver implementation. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-28-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 192 +++++++++++++-------------------------------- 1 file changed, 54 insertions(+), 138 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 3c825867aeb5..109584261c4e 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -62,10 +62,8 @@ static struct irq_chip pci_msi_controller = { .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, - .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -153,9 +151,7 @@ static struct irq_chip pci_msi_ir_controller = { .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -175,23 +171,6 @@ struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) #endif #ifdef CONFIG_DMAR_TABLE -static int -dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_data *parent = data->parent_data; - struct msi_msg msg; - int ret; - - ret = parent->chip->irq_set_affinity(parent, mask, force); - if (ret >= 0) { - irq_chip_compose_msi_msg(data, &msg); - dmar_msi_write(data->irq, &msg); - } - - return ret; -} - static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { dmar_msi_write(data->irq, msg); @@ -202,67 +181,37 @@ static struct irq_chip dmar_msi_controller = { .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = dmar_msi_set_affinity, + .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -static int dmar_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg) +static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) { - struct irq_alloc_info *info = arg; - int ret; - - if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_DMAR) - return -EINVAL; - if (irq_find_mapping(domain, info->dmar_id)) { - pr_warn("IRQ for DMAR%d already exists.\n", info->dmar_id); - return -EEXIST; - } - - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret >= 0) { - irq_domain_set_hwirq_and_chip(domain, virq, info->dmar_id, - &dmar_msi_controller, NULL); - irq_set_handler_data(virq, info->dmar_data); - __irq_set_handler(virq, handle_edge_irq, 0, "edge"); - } - - return ret; + return arg->dmar_id; } -static void dmar_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) +static int dmar_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - BUG_ON(nr_irqs > 1); - irq_domain_free_irqs_top(domain, virq, nr_irqs); -} + irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL, + handle_edge_irq, arg->dmar_data, "edge"); -static void dmar_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) -{ - struct msi_msg msg; - - BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); - dmar_msi_write(irq_data->irq, &msg); + return 0; } -static void dmar_domain_deactivate(struct irq_domain *domain, - struct irq_data *irq_data) -{ - struct msi_msg msg; - - memset(&msg, 0, sizeof(msg)); - dmar_msi_write(irq_data->irq, &msg); -} +static struct msi_domain_ops dmar_msi_domain_ops = { + .get_hwirq = dmar_msi_get_hwirq, + .msi_init = dmar_msi_init, +}; -static struct irq_domain_ops dmar_domain_ops = { - .alloc = dmar_domain_alloc, - .free = dmar_domain_free, - .activate = dmar_domain_activate, - .deactivate = dmar_domain_deactivate, +static struct msi_domain_info dmar_msi_domain_info = { + .ops = &dmar_msi_domain_ops, + .chip = &dmar_msi_controller, }; static struct irq_domain *dmar_get_irq_domain(void) @@ -271,11 +220,9 @@ static struct irq_domain *dmar_get_irq_domain(void) static DEFINE_MUTEX(dmar_lock); mutex_lock(&dmar_lock); - if (dmar_domain == NULL) { - dmar_domain = irq_domain_add_tree(NULL, &dmar_domain_ops, NULL); - if (dmar_domain) - dmar_domain->parent = x86_vector_domain; - } + if (dmar_domain == NULL) + dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info, + x86_vector_domain); mutex_unlock(&dmar_lock); return dmar_domain; @@ -309,23 +256,9 @@ void dmar_free_hwirq(int irq) #ifdef CONFIG_HPET_TIMER static inline int hpet_dev_id(struct irq_domain *domain) { - return (int)(long)domain->host_data; -} + struct msi_domain_info *info = msi_get_domain_info(domain); -static int hpet_msi_set_affinity(struct irq_data *data, - const struct cpumask *mask, bool force) -{ - struct irq_data *parent = data->parent_data; - struct msi_msg msg; - int ret; - - ret = parent->chip->irq_set_affinity(parent, mask, force); - if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { - irq_chip_compose_msi_msg(data, &msg); - hpet_msi_write(data->handler_data, &msg); - } - - return ret; + return (int)(long)info->data; } static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) @@ -338,79 +271,63 @@ static struct irq_chip hpet_msi_controller = { .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = hpet_msi_set_affinity, + .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = hpet_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -static int hpet_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg) -{ - struct irq_alloc_info *info = arg; - int ret; - - if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_HPET) - return -EINVAL; - if (irq_find_mapping(domain, info->hpet_index)) { - pr_warn("IRQ for HPET%d already exists.\n", info->hpet_index); - return -EEXIST; - } - - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret >= 0) { - irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_hwirq_and_chip(domain, virq, info->hpet_index, - &hpet_msi_controller, NULL); - irq_set_handler_data(virq, info->hpet_data); - __irq_set_handler(virq, handle_edge_irq, 0, "edge"); - } - - return ret; -} - -static void hpet_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) +static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) { - BUG_ON(nr_irqs > 1); - irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_free_irqs_top(domain, virq, nr_irqs); + return arg->hpet_index; } -static void hpet_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - struct msi_msg msg; + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL, + handle_edge_irq, arg->hpet_data, "edge"); - BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); - hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); + return 0; } -static void hpet_domain_deactivate(struct irq_domain *domain, - struct irq_data *irq_data) +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) { - struct msi_msg msg; - - memset(&msg, 0, sizeof(msg)); - hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); } -static struct irq_domain_ops hpet_domain_ops = { - .alloc = hpet_domain_alloc, - .free = hpet_domain_free, - .activate = hpet_domain_activate, - .deactivate = hpet_domain_deactivate, +static struct msi_domain_ops hpet_msi_domain_ops = { + .get_hwirq = hpet_msi_get_hwirq, + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, }; struct irq_domain *hpet_create_irq_domain(int hpet_id) { struct irq_domain *parent; struct irq_alloc_info info; + struct msi_domain_info *domain_info; if (x86_vector_domain == NULL) return NULL; + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_HPET; info.hpet_id = hpet_id; @@ -420,8 +337,7 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) else hpet_msi_controller.name = "IR-HPET-MSI"; - return irq_domain_add_hierarchy(parent, 0, 0, NULL, &hpet_domain_ops, - (void *)(long)hpet_id); + return msi_create_irq_domain(NULL, domain_info, parent); } int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, -- cgit v1.2.1 From 0cddfc79462423bf86cbe34560bad07f4a25ded6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:50 +0800 Subject: irq_remapping: Remove unused function irq_remapping_print_chip() Now there's no user of irq_remapping_print_chip() anymore, so remove it. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-29-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index bacac1052262..52e1a1f337b2 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -59,7 +59,6 @@ extern struct irq_domain * irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info); extern struct irq_domain * irq_remapping_get_irq_domain(struct irq_alloc_info *info); -extern void irq_remapping_print_chip(struct irq_data *data, struct seq_file *p); /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent); @@ -115,6 +114,5 @@ irq_remapping_get_irq_domain(struct irq_alloc_info *info) return NULL; } -#define irq_remapping_print_chip NULL #endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_IRQ_REMAPPING_H */ -- cgit v1.2.1 From 6648d1b42c349d748839d7bad91cc8a65c73e262 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Apr 2015 14:11:51 +0800 Subject: x86/intel-mid: Delay initialization of APB timer MID has no PIC, but depending on the platform it requires the abt_timer, which is connected to irq0. The timer is set up at late_time_init(). But, looking at the MID code it seems, that there is no reason to do so. The only code which might need the timer working is the TSC calibration code, but thats a non issue on MID as that is using its own empty calibration function. And check_timer() is not invoked either because MID has no PIC and therefor no legacy irqs. So if you look at intel_mid_time_init() then you'll see that in the ARAT case the timer setup is skipped already. So until the point where x86_init.timers.setup_percpu_clockev() is called for the boot cpu nothing really needs a timer on MID. According to the MID code the apbt horror is only used for moorestown. Medfield and later use the local apic timer without the apbt nonsense. The best thing we can do is to drop moorestown support and get rid of that apbt nonsense alltogether. I don't think anyone deeply cares about it not being supported from 3.18 on. The number of devices which sport a moorestown should be pretty limited and the only relevant use case of those is to act as a pocket heater with short battery life time. Its pretty pointless to update kernels on pocket heaters except for bragging reasons. If someone at Intel really thinks that we need to keep moorestown alive for other than documentary and sentimental reasons, then we can move the apbt setup to x86_init.timers.setup_percpu_clockev(). At that point the IOAPIC is setup already, so it should just work. Signed-off-by: Thomas Gleixner Tested-by: Andy Shevchenko Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Kuppuswamy Sathyanarayanan Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Rickard Strandqvist Link: http://lkml.kernel.org/r/1428905519-23704-30-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apb_timer.c | 4 ---- arch/x86/platform/intel-mid/intel-mid.c | 18 +++++++++++++----- arch/x86/platform/intel-mid/sfi.c | 2 -- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 6a7c23ff21d3..ede92c3364d3 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -171,10 +171,6 @@ static int __init apbt_clockevent_register(void) static void apbt_setup_irq(struct apbt_dev *adev) { - /* timer0 irq has been setup early */ - if (adev->irq == 0) - return; - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); } diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 3005f0c89f2e..01d54ea766c1 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -81,26 +81,34 @@ static unsigned long __init intel_mid_calibrate_tsc(void) return 0; } +static void __init intel_mid_setup_bp_timer(void) +{ + apbt_time_init(); + setup_boot_APIC_clock(); +} + static void __init intel_mid_time_init(void) { sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); + switch (intel_mid_timer_options) { case INTEL_MID_TIMER_APBT_ONLY: break; case INTEL_MID_TIMER_LAPIC_APBT: - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + /* Use apbt and local apic */ + x86_init.timers.setup_percpu_clockev = intel_mid_setup_bp_timer; x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - break; + return; default: if (!boot_cpu_has(X86_FEATURE_ARAT)) break; + /* Lapic only, no apbt */ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; return; } - /* we need at least one APB timer */ - pre_init_apic_IRQ0(); - apbt_time_init(); + + x86_init.timers.setup_percpu_clockev = apbt_time_init; } static void intel_mid_arch_setup(void) diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index c14ad34776c4..aa59f88868f8 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -95,8 +95,6 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table) pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->freq_hz, pentry->irq); - if (!pentry->irq) - continue; mp_irq.type = MP_INTSRC; mp_irq.irqtype = mp_INT; /* triggering mode edge bit 2-3, active high polarity bit 0-1 */ -- cgit v1.2.1 From b0415817cb7960f408da51e6fedecc6a19e2b895 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:52 +0800 Subject: x86/intel-mid, trivial: Refine code syntax for sfi_parse_mtmr() Correctly indent code in function sfi_parse_mtmr(). Signed-off-by: Jiang Liu Tested-by: Andy Shevchenko Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Jiri Kosina Link: http://lkml.kernel.org/r/1428905519-23704-31-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/platform/intel-mid/sfi.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index aa59f88868f8..9a16749935d4 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -95,16 +95,16 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table) pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->freq_hz, pentry->irq); - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; -/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ - mp_irq.irqflag = 5; - mp_irq.srcbus = MP_BUS_ISA; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + /* triggering mode edge bit 2-3, active high polarity bit 0-1 */ + mp_irq.irqflag = 5; + mp_irq.srcbus = MP_BUS_ISA; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + mp_save_irq(&mp_irq); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); } return 0; -- cgit v1.2.1 From 4e69d7eab4c24aa88fb0ec99fad7feac254d9ece Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:53 +0800 Subject: x86/irq: Remove unused pre_init_apic_IRQ0() Now there's no user of pre_init_apic_IRQ0(), so remove it. Signed-off-by: Jiang Liu Tested-by: Andy Shevchenko Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Jan Beulich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-32-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 1 - arch/x86/kernel/apic/io_apic.c | 17 ----------------- 2 files changed, 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 2f91685fe1cd..c976de126a91 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -204,7 +204,6 @@ extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq); extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); -extern void __init pre_init_apic_IRQ0(void); extern void mp_save_irq(struct mpc_intsrc *m); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 56d532106ef3..540598c77e55 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3091,20 +3091,3 @@ int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) return ret; } - -/* Enable IOAPIC early just for system timer */ -void __init pre_init_apic_IRQ0(void) -{ - struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; - - printk(KERN_INFO "Early APIC setup for system timer0\n"); -#ifndef CONFIG_SMP - physid_set_mask_of_physid(boot_cpu_physical_apicid, - &phys_cpu_present_map); -#endif - setup_local_APIC(); - - io_apic_setup_irq_pin(0, 0, &attr); - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); -} -- cgit v1.2.1 From c4d05a2c354b15965c9b2a5f46016a5d9f43e224 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:54 +0800 Subject: x86/irq: Prepare IOAPIC interfaces to support hierarchical irqdomains Introduce helper functions to manipulate struct irq_alloc_info for IOAPIC. Also add an extra parameter to IOAPIC interfaces to prepare for hierarchical irqdomain. Function mp_set_gsi_attr() will be removed once we have switched to hierarchical irqdomains. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Cc: Jan Beulich Cc: Grant Likely Cc: David Cohen Link: http://lkml.kernel.org/r/1428905519-23704-33-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 14 ++++++-- arch/x86/kernel/acpi/boot.c | 6 ++-- arch/x86/kernel/apic/io_apic.c | 39 ++++++++++++++-------- arch/x86/pci/intel_mid_pci.c | 4 ++- .../platform/intel-mid/device_libs/platform_wdt.c | 4 ++- arch/x86/platform/intel-mid/sfi.c | 9 +++-- 6 files changed, 54 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index c976de126a91..1fbeda50a77d 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -95,6 +95,8 @@ struct IR_IO_APIC_route_entry { index : 15; } __attribute__ ((packed)); +struct irq_alloc_info; + #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 @@ -194,7 +196,8 @@ extern u32 gsi_top; extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); extern u32 mp_pin_to_gsi(int ioapic, int pin); -extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags); +extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info); extern void mp_unmap_irq(int irq); extern int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg); @@ -203,6 +206,8 @@ extern int mp_ioapic_registered(u32 gsi_base); extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq); extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); +extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, + int node, int trigger, int polarity); extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); extern void mp_save_irq(struct mpc_intsrc *m); @@ -253,7 +258,12 @@ static inline void print_IO_APICs(void) {} #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; } -static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; } +static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info) +{ + return gsi; +} + static inline void mp_unmap_irq(int irq) { } static inline int save_ioapic_entries(void) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 803b684676ff..a43a4d3c60e1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -404,6 +404,7 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { int irq, node; + struct irq_alloc_info info; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -416,7 +417,8 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, return -1; } - irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC); + ioapic_set_alloc_attr(&info, node, trigger, polarity); + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (irq < 0) return irq; @@ -434,7 +436,7 @@ static void mp_unregister_gsi(u32 gsi) if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return; - irq = mp_map_gsi_to_irq(gsi, 0); + irq = mp_map_gsi_to_irq(gsi, 0, NULL); if (irq > 0) mp_unmap_irq(irq); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 540598c77e55..5c953bb96ecf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -938,7 +938,19 @@ static int irq_trigger(int idx) return trigger; } -static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin) +void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, + int trigger, int polarity) +{ + init_irq_alloc_info(info, NULL); + info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info->ioapic_node = node; + info->ioapic_trigger = trigger; + info->ioapic_polarity = polarity; + info->ioapic_valid = 1; +} + +static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, + struct irq_alloc_info *info) { int irq = -1; int ioapic = (int)(long)domain->host_data; @@ -971,11 +983,11 @@ static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin) } static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, - unsigned int flags) + unsigned int flags, struct irq_alloc_info *info) { int irq; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); - struct mp_pin_info *info = mp_pin_info(ioapic, pin); + struct mp_pin_info *pinfo = mp_pin_info(ioapic, pin); if (!domain) return -1; @@ -997,30 +1009,30 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; if (flags & IOAPIC_MAP_ALLOC) { - if (info->count == 0 && + if (pinfo->count == 0 && mp_irqdomain_map(domain, irq, pin) != 0) irq = -1; /* special handling for timer IRQ0 */ if (irq == 0) - info->count++; + pinfo->count++; } } else { irq = irq_find_mapping(domain, pin); if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC)) - irq = alloc_irq_from_domain(domain, gsi, pin); + irq = alloc_irq_from_domain(domain, gsi, pin, info); } if (flags & IOAPIC_MAP_ALLOC) { /* special handling for legacy IRQs */ - if (irq < nr_legacy_irqs() && info->count == 1 && + if (irq < nr_legacy_irqs() && pinfo->count == 1 && mp_irqdomain_map(domain, irq, pin) != 0) irq = -1; if (irq > 0) - info->count++; - else if (info->count == 0) - info->set = 0; + pinfo->count++; + else if (pinfo->count == 0) + pinfo->set = 0; } mutex_unlock(&ioapic_mutex); @@ -1058,10 +1070,11 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) } #endif - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL); } -int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info) { int ioapic, pin, idx; @@ -1074,7 +1087,7 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) if ((flags & IOAPIC_MAP_CHECK) && idx < 0) return -1; - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); } void mp_unmap_irq(int irq) diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 852aa4c92da0..57b5719b617b 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -208,6 +208,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, static int intel_mid_pci_irq_enable(struct pci_dev *dev) { + struct irq_alloc_info info; int polarity; if (dev->irq_managed && dev->irq > 0) @@ -217,6 +218,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) polarity = 0; /* active high */ else polarity = 1; /* active low */ + ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity); /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to @@ -224,7 +226,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) */ if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev))) return -EBUSY; - if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0) + if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0) return -EBUSY; dev->irq_managed = 1; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c index 0b283d4d0ad7..de0009f6d555 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c @@ -27,6 +27,7 @@ static struct platform_device wdt_dev = { static int tangier_probe(struct platform_device *pdev) { int gsi; + struct irq_alloc_info info; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; if (!pdata) @@ -34,8 +35,9 @@ static int tangier_probe(struct platform_device *pdev) /* IOAPIC builds identity mapping between GSI and IRQ on MID */ gsi = pdata->irq; + ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) || - mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) { + mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi); return -EINVAL; diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 9a16749935d4..7d17355d820e 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -104,7 +104,7 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table) mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); } return 0; @@ -175,7 +175,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); } return 0; } @@ -434,6 +434,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) struct devs_id *dev = NULL; int num, i, ret; int polarity; + struct irq_alloc_info info; sb = (struct sfi_table_simple *)table; num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); @@ -467,9 +468,11 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) polarity = 1; } + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity); ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE); if (ret == 0) - ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC); + ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, + &info); WARN_ON(ret < 0); } -- cgit v1.2.1 From 49c7e60022912d10da88ba67e8eb2927f1143f6a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:55 +0800 Subject: x86/irq: Implement callbacks to enable hierarchical irqdomains on IOAPICs Implement required callbacks to prepare for enabling hierarchical irqdomains on IOAPICs. After the conversion we can remove quite some code from the old implementation. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Jan Beulich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-34-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 10 +++ arch/x86/kernel/apic/io_apic.c | 159 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 166 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 1fbeda50a77d..ecc192624eaf 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -96,6 +96,7 @@ struct IR_IO_APIC_route_entry { } __attribute__ ((packed)); struct irq_alloc_info; +struct irq_data; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -206,6 +207,15 @@ extern int mp_ioapic_registered(u32 gsi_base); extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq); extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); +extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg); +extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs); +extern void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data); +extern void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data); +extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity); extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5c953bb96ecf..3406dbec1570 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -78,6 +78,13 @@ static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; static int ioapic_initialized; +struct mp_chip_data { + struct IO_APIC_route_entry entry; + int trigger; + int polarity; + bool isa_irq; +}; + struct mp_pin_info { int trigger; int polarity; @@ -949,11 +956,28 @@ void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, info->ioapic_valid = 1; } +static void mp_register_handler(unsigned int irq, unsigned long trigger) +{ + irq_flow_handler_t hdl; + bool fasteoi; + + if (trigger) { + irq_set_status_flags(irq, IRQ_LEVEL); + fasteoi = true; + } else { + irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } + + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge"); +} + static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, struct irq_alloc_info *info) { int irq = -1; - int ioapic = (int)(long)domain->host_data; + int ioapic = mp_irqdomain_ioapic_idx(domain); int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { @@ -3029,7 +3053,7 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { - int ioapic = (int)(long)domain->host_data; + int ioapic = mp_irqdomain_ioapic_idx(domain); struct mp_pin_info *info = mp_pin_info(ioapic, hwirq); struct io_apic_irq_attr attr; @@ -3067,7 +3091,7 @@ void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) { struct irq_data *data = irq_get_irq_data(virq); struct irq_cfg *cfg = irq_cfg(virq); - int ioapic = (int)(long)domain->host_data; + int ioapic = mp_irqdomain_ioapic_idx(domain); int pin = (int)data->hwirq; ioapic_mask_entry(ioapic, pin); @@ -3076,6 +3100,130 @@ void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) arch_teardown_hwirq(virq); } +static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->ioapic_valid) { + data->trigger = info->ioapic_trigger; + data->polarity = info->ioapic_polarity; + } else if (acpi_get_override_irq(gsi, &data->trigger, + &data->polarity) < 0) { + /* PCI interrupts are always polarity one level triggered. */ + data->trigger = 1; + data->polarity = 1; + } +} + +static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, + struct IO_APIC_route_entry *entry) +{ + memset(entry, 0, sizeof(*entry)); + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->dest = cfg->dest_apicid; + entry->vector = cfg->vector; + entry->mask = 0; /* enable IRQ */ + entry->trigger = data->trigger; + entry->polarity = data->polarity; + /* + * Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (data->trigger) + entry->mask = 1; +} + +int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + int ret, ioapic, pin; + struct irq_cfg *cfg; + struct irq_data *irq_data; + struct mp_chip_data *data; + struct irq_alloc_info *info = arg; + + if (!info || nr_irqs > 1) + return -EINVAL; + irq_data = irq_domain_get_irq_data(domain, virq); + if (!irq_data) + return -EINVAL; + + ioapic = mp_irqdomain_ioapic_idx(domain); + pin = info->ioapic_pin; + if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) + return -EEXIST; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + info->ioapic_entry = &data->entry; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(data); + return ret; + } + + irq_data->hwirq = info->ioapic_pin; + irq_data->chip = &ioapic_chip; + irq_data->chip_data = data; + mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); + + cfg = irqd_cfg(irq_data); + add_pin_to_irq_node(cfg, info->ioapic_node, ioapic, pin); + if (info->ioapic_entry) + mp_setup_entry(cfg, data, info->ioapic_entry); + mp_register_handler(virq, data->trigger); + if (virq < nr_legacy_irqs()) + legacy_pic->mask(virq); + + apic_printk(APIC_VERBOSE, KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", + ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, + virq, data->trigger, data->polarity, cfg->dest_apicid); + + return 0; +} + +void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_cfg *cfg = irq_cfg(virq); + struct irq_data *irq_data; + + BUG_ON(nr_irqs != 1); + irq_data = irq_domain_get_irq_data(domain, virq); + if (irq_data && irq_data->chip_data) { + __remove_pin_from_irq(cfg, mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); + WARN_ON(!list_empty(&cfg->irq_2_pin)); + kfree(irq_data->chip_data); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + unsigned long flags; + struct irq_pin_list *entry; + struct mp_chip_data *data = irq_data->chip_data; + struct irq_cfg *cfg = irqd_cfg(irq_data); + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, data->entry); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + /* It won't be called for IRQ with multiple IOAPIC pins associated */ + ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); +} + int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) { int ret = 0; @@ -3104,3 +3252,8 @@ int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) return ret; } + +int mp_irqdomain_ioapic_idx(struct irq_domain *domain) +{ + return (int)(long)domain->host_data; +} -- cgit v1.2.1 From 133153205b263ea9ce4e771876ede544f896e034 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:56 +0800 Subject: x86/irq: Refine the way to allocate irq_cfg for legacy IRQs To support legacy ISA IRQs, we need to preallocate irq_cfg structures for legacy ISA IRQs. Refine the way to allocate irq_cfg for legacy ISA IRQs, so it's more friendly for the hierarchical irqdomain implementation. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-35-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 13 +------------ arch/x86/kernel/apic/vector.c | 42 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3406dbec1570..16d4ba3ac844 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -254,8 +254,7 @@ static void free_ioapic_saved_registers(int idx) int __init arch_early_ioapic_init(void) { - struct irq_cfg *cfg; - int i, node = cpu_to_node(0); + int i; if (!nr_legacy_irqs()) io_apic_irqs = ~0UL; @@ -263,16 +262,6 @@ int __init arch_early_ioapic_init(void) for_each_ioapic(i) alloc_ioapic_saved_registers(i); - /* - * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. - */ - for (i = 0; i < nr_legacy_irqs(); i++) { - cfg = alloc_irq_and_cfg_at(i, node); - cfg->vector = IRQ0_VECTOR + i; - cpumask_setall(cfg->domain); - } - return 0; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b4b6b5a13440..633f03268d48 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -24,6 +24,9 @@ struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); static struct irq_chip lapic_controller; +#ifdef CONFIG_X86_IO_APIC +static struct irq_cfg *legacy_irq_cfgs[NR_IRQS_LEGACY]; +#endif void lock_vector_lock(void) { @@ -283,6 +286,10 @@ static void x86_vector_free_irqs(struct irq_domain *domain, free_remapped_irq(virq); clear_irq_vector(virq + i, irq_data->chip_data); free_irq_cfg(irq_data->chip_data); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs()) + legacy_irq_cfgs[virq + i] = NULL; +#endif irq_domain_reset_irq_data(irq_data); } } @@ -308,7 +315,12 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irq_data); - cfg = alloc_irq_cfg(irq_data->node); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs() && legacy_irq_cfgs[virq + i]) + cfg = legacy_irq_cfgs[virq + i]; + else +#endif + cfg = alloc_irq_cfg(irq_data->node); if (!cfg) { err = -ENOMEM; goto error; @@ -357,8 +369,36 @@ int __init arch_probe_nr_irqs(void) return nr_legacy_irqs(); } +#ifdef CONFIG_X86_IO_APIC +static void init_legacy_irqs(void) +{ + int i, node = cpu_to_node(0); + struct irq_cfg *cfg; + + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + cfg = legacy_irq_cfgs[i] = alloc_irq_cfg(node); + BUG_ON(!cfg); + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. + */ + cfg->vector = IRQ0_VECTOR + i; + cpumask_setall(cfg->domain); + irq_set_chip_data(i, cfg); + } +} +#else +static void init_legacy_irqs(void) { } +#endif + int __init arch_early_irq_init(void) { + init_legacy_irqs(); + x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops, NULL); BUG_ON(x86_vector_domain == NULL); -- cgit v1.2.1 From a44174ee7b380012cdb63d563617f67bb7757649 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:57 +0800 Subject: x86/irq: Simplify the way to print IOAPIC entry Simplify the way to print IOAPIC entry content, so we can remove native_io_apic_print_entries(), intel_ir_io_apic_print_entries() and x86_io_apic_ops.print_entries() later. Folded a patch from Thomas to fix errors in printed pin attributes, http://www.spinics.net/lists/linux-tip-commits/msg26108.html Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-36-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 16d4ba3ac844..3c6609617306 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1424,6 +1424,33 @@ void ioapic_zap_locks(void) raw_spin_lock_init(&ioapic_lock); } +static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) +{ + int i; + char buf[256]; + struct IO_APIC_route_entry entry; + struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; + + printk(KERN_DEBUG "IOAPIC %d:\n", apic); + for (i = 0; i <= nr_entries; i++) { + entry = ioapic_read_entry(apic, i); + snprintf(buf, sizeof(buf), + " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", + i, entry.mask ? "disabled" : "enabled ", + entry.trigger ? "level" : "edge ", + entry.polarity ? "low " : "high", + entry.vector, entry.irr, entry.delivery_status); + if (ir_entry->format) + printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", + buf, (ir_entry->index << 15) | ir_entry->index, + ir_entry->zero); + else + printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", + buf, entry.dest_mode ? "logical " : "physical", + entry.dest, entry.delivery_mode); + } +} + static void __init print_IO_APIC(int ioapic_idx) { union IO_APIC_reg_00 reg_00; @@ -1477,8 +1504,7 @@ static void __init print_IO_APIC(int ioapic_idx) } printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries); + io_apic_print_entries(ioapic_idx, reg_01.bits.entries); } void __init print_IO_APICs(void) -- cgit v1.2.1 From 96ed44b2d5e0e9d6e5b135e84ea5c8cd763ce861 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:58 +0800 Subject: x86/irq: Introduce helper functions to support hierarchical irqdomains for IOAPIC Introduce several helper functions, which will be used to enable hierarchical irqdomain for IOAPIC. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-37-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 61 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c6609617306..c8f786b5b91c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -82,6 +82,7 @@ struct mp_chip_data { struct IO_APIC_route_entry entry; int trigger; int polarity; + u32 count; bool isa_irq; }; @@ -945,6 +946,46 @@ void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, info->ioapic_valid = 1; } +#ifndef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); +#endif + +static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, + struct irq_alloc_info *src, + u32 gsi, int ioapic_idx, int pin) +{ + int trigger, polarity; + + copy_irq_alloc_info(dst, src); + dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + dst->ioapic_id = mpc_ioapic_id(ioapic_idx); + dst->ioapic_pin = pin; + dst->ioapic_valid = 1; + if (src && src->ioapic_valid) { + dst->ioapic_node = src->ioapic_node; + dst->ioapic_trigger = src->ioapic_trigger; + dst->ioapic_polarity = src->ioapic_polarity; + } else { + dst->ioapic_node = NUMA_NO_NODE; + if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { + dst->ioapic_trigger = trigger; + dst->ioapic_polarity = polarity; + } else { + /* + * PCI interrupts are always polarity one level + * triggered. + */ + dst->ioapic_trigger = 1; + dst->ioapic_polarity = 1; + } + } +} + +static int ioapic_alloc_attr_node(struct irq_alloc_info *info) +{ + return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE; +} + static void mp_register_handler(unsigned int irq, unsigned long trigger) { irq_flow_handler_t hdl; @@ -962,6 +1003,26 @@ static void mp_register_handler(unsigned int irq, unsigned long trigger) __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge"); } +static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) +{ + struct mp_chip_data *data = irq_get_chip_data(irq); + + /* + * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger + * and polarity attirbutes. So allow the first user to reprogram the + * pin with real trigger and polarity attributes. + */ + if (irq < nr_legacy_irqs() && data->count == 1) { + if (info->ioapic_trigger != data->trigger) + mp_register_handler(irq, data->trigger); + data->entry.trigger = data->trigger = info->ioapic_trigger; + data->entry.polarity = data->polarity = info->ioapic_polarity; + } + + return data->trigger == info->ioapic_trigger && + data->polarity == info->ioapic_polarity; +} + static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, struct irq_alloc_info *info) { -- cgit v1.2.1 From d32932d02e1869be838cea3ace42467c360db377 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:59 +0800 Subject: x86/irq: Convert IOAPIC to use hierarchical irqdomain interfaces Convert IOAPIC driver to support and use hierarchical irqdomain interfaces. It's a little big, but would break bisecting if we split it into multiple patches. Fold in a patch from Andy Shevchenko to make it bisectable. http://lkml.org/lkml/2014/12/10/622 Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Andy Shevchenko Cc: sfi-devel@simplefirmware.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Cc: Grant Likely Cc: Rob Herring Cc: David Rientjes Cc: David Cohen Link: http://lkml.kernel.org/r/1428905519-23704-38-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 11 +- arch/x86/kernel/apic/io_apic.c | 308 ++++++++++++++------- arch/x86/kernel/devicetree.c | 37 +-- arch/x86/kernel/mpparse.c | 6 +- arch/x86/pci/intel_mid_pci.c | 2 - .../platform/intel-mid/device_libs/platform_wdt.c | 3 +- arch/x86/platform/intel-mid/sfi.c | 5 +- arch/x86/platform/sfi/sfi.c | 5 +- 8 files changed, 240 insertions(+), 137 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a43a4d3c60e1..21e460b3b360 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -412,11 +412,6 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; node = dev ? dev_to_node(dev) : NUMA_NO_NODE; - if (mp_set_gsi_attr(gsi, trigger, polarity, node)) { - pr_warn("Failed to set pin attr for GSI%d\n", gsi); - return -1; - } - ioapic_set_alloc_attr(&info, node, trigger, polarity); irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (irq < 0) @@ -442,8 +437,10 @@ static void mp_unregister_gsi(u32 gsi) } static struct irq_domain_ops acpi_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static int __init diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c8f786b5b91c..ba50f8d6f6b0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -142,6 +142,11 @@ u32 mp_pin_to_gsi(int ioapic, int pin) return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; } +static inline bool mp_is_legacy_irq(int irq) +{ + return irq >= 0 && irq < nr_legacy_irqs(); +} + /* * Initialize all legacy IRQs and all pins on the first IOAPIC * if we have legacy interrupt controller. Kernel boot option "pirq=" @@ -152,7 +157,7 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq) if (!nr_legacy_irqs()) return 0; - return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs()); + return ioapic == 0 || mp_is_legacy_irq(irq); } static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin) @@ -231,7 +236,7 @@ struct irq_pin_list { static struct irq_pin_list *alloc_irq_pin_list(int node) { - return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); + return kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); } static void alloc_ioapic_saved_registers(int idx) @@ -560,6 +565,17 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector) } } +void eoi_ioapic_pin(int vector, struct irq_cfg *cfg) +{ + unsigned long flags; + struct irq_pin_list *entry; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) + native_eoi_ioapic_pin(entry->apic, entry->pin, vector); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { struct irq_pin_list *entry; @@ -603,9 +619,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) entry.trigger = IOAPIC_LEVEL; ioapic_write_entry(apic, pin, entry); } - raw_spin_lock_irqsave(&ioapic_lock, flags); - x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector); + native_eoi_ioapic_pin(apic, pin, entry.vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1023,95 +1038,121 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) data->polarity == info->ioapic_polarity; } -static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, +static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, struct irq_alloc_info *info) { + bool legacy = false; int irq = -1; - int ioapic = mp_irqdomain_ioapic_idx(domain); int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { case IOAPIC_DOMAIN_LEGACY: /* - * Dynamically allocate IRQ number for non-ISA IRQs in the first 16 - * GSIs on some weird platforms. + * Dynamically allocate IRQ number for non-ISA IRQs in the first + * 16 GSIs on some weird platforms. */ - if (gsi < nr_legacy_irqs()) - irq = irq_create_mapping(domain, pin); - else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) + if (!ioapic_initialized || gsi >= nr_legacy_irqs()) irq = gsi; + legacy = mp_is_legacy_irq(irq); break; case IOAPIC_DOMAIN_STRICT: - if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) - irq = gsi; + irq = gsi; break; case IOAPIC_DOMAIN_DYNAMIC: - irq = irq_create_mapping(domain, pin); break; default: WARN(1, "ioapic: unknown irqdomain type %d\n", type); - break; + return -1; + } + + return __irq_domain_alloc_irqs(domain, irq, 1, + ioapic_alloc_attr_node(info), + info, legacy); +} + +/* + * Need special handling for ISA IRQs because there may be multiple IOAPIC pins + * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping + * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are + * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). + * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and + * some BIOSes may use MP Interrupt Source records to override IRQ numbers for + * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be + * multiple pins sharing the same legacy IRQ number when ACPI is disabled. + */ +static int alloc_isa_irq_from_domain(struct irq_domain *domain, + int irq, int ioapic, int pin, + struct irq_alloc_info *info) +{ + struct mp_chip_data *data; + struct irq_data *irq_data = irq_get_irq_data(irq); + int node = ioapic_alloc_attr_node(info); + + /* + * Legacy ISA IRQ has already been allocated, just add pin to + * the pin list assoicated with this IRQ and program the IOAPIC + * entry. The IOAPIC entry + */ + if (irq_data && irq_data->parent_data) { + struct irq_cfg *cfg = irqd_cfg(irq_data); + + if (!mp_check_pin_attr(irq, info)) + return -EBUSY; + if (__add_pin_to_irq_node(cfg, node, ioapic, info->ioapic_pin)) + return -ENOMEM; + } else { + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + if (irq >= 0) { + irq_data = irq_domain_get_irq_data(domain, irq); + data = irq_data->chip_data; + data->isa_irq = true; + } } - return irq > 0 ? irq : -1; + return irq; } static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, unsigned int flags, struct irq_alloc_info *info) { int irq; + bool legacy = false; + struct irq_alloc_info tmp; + struct mp_chip_data *data; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); - struct mp_pin_info *pinfo = mp_pin_info(ioapic, pin); if (!domain) - return -1; + return -ENOSYS; - mutex_lock(&ioapic_mutex); - - /* - * Don't use irqdomain to manage ISA IRQs because there may be - * multiple IOAPIC pins sharing the same ISA IRQ number and - * irqdomain only supports 1:1 mapping between IOAPIC pin and - * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used - * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). - * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are - * available, and some BIOSes may use MP Interrupt Source records - * to override IRQ numbers for PIRQs instead of reprogramming - * the interrupt routing logic. Thus there may be multiple pins - * sharing the same legacy IRQ number when ACPI is disabled. - */ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; - if (flags & IOAPIC_MAP_ALLOC) { - if (pinfo->count == 0 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; + legacy = mp_is_legacy_irq(irq); + } - /* special handling for timer IRQ0 */ + mutex_lock(&ioapic_mutex); + if (!(flags & IOAPIC_MAP_ALLOC)) { + if (!legacy) { + irq = irq_find_mapping(domain, pin); if (irq == 0) - pinfo->count++; + irq = -ENOENT; } } else { - irq = irq_find_mapping(domain, pin); - if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC)) - irq = alloc_irq_from_domain(domain, gsi, pin, info); - } - - if (flags & IOAPIC_MAP_ALLOC) { - /* special handling for legacy IRQs */ - if (irq < nr_legacy_irqs() && pinfo->count == 1 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; - - if (irq > 0) - pinfo->count++; - else if (pinfo->count == 0) - pinfo->set = 0; + ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin); + if (legacy) + irq = alloc_isa_irq_from_domain(domain, irq, + ioapic, pin, &tmp); + else if ((irq = irq_find_mapping(domain, pin)) == 0) + irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp); + else if (!mp_check_pin_attr(irq, &tmp)) + irq = -EBUSY; + if (irq >= 0) { + data = irq_get_chip_data(irq); + data->count++; + } } - mutex_unlock(&ioapic_mutex); - return irq > 0 ? irq : -1; + return irq; } static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) @@ -1166,26 +1207,19 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, void mp_unmap_irq(int irq) { - struct irq_data *data = irq_get_irq_data(irq); - struct mp_pin_info *info; - int ioapic, pin; + struct irq_data *irq_data = irq_get_irq_data(irq); + struct mp_chip_data *data; - if (!data || !data->domain) + if (!irq_data || !irq_data->domain) return; - ioapic = (int)(long)data->domain->host_data; - pin = (int)data->hwirq; - info = mp_pin_info(ioapic, pin); + data = irq_data->chip_data; + if (!data || data->isa_irq) + return; mutex_lock(&ioapic_mutex); - if (--info->count == 0) { - info->set = 0; - if (irq < nr_legacy_irqs() && - ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY) - mp_irqdomain_unmap(data->domain, irq); - else - irq_dispose_mapping(irq); - } + if (--data->count == 0) + irq_domain_free_irqs(irq, 1); mutex_unlock(&ioapic_mutex); } @@ -1252,7 +1286,7 @@ out: } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -static struct irq_chip ioapic_chip; +static struct irq_chip ioapic_chip, ioapic_ir_chip; #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) @@ -1595,7 +1629,7 @@ void __init print_IO_APICs(void) struct irq_pin_list *entry; chip = irq_get_chip(irq); - if (chip != &ioapic_chip) + if (chip != &ioapic_chip && chip != &ioapic_ir_chip) continue; cfg = irq_cfg(irq); @@ -2057,12 +2091,12 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, } #endif -static void ack_ioapic_level(struct irq_data *data) +static void ioapic_ack_level(struct irq_data *data) { struct irq_cfg *cfg = irqd_cfg(data); - int i, irq = data->irq; unsigned long v; bool masked; + int i; irq_complete_move(cfg); masked = ioapic_irqd_mask(data, cfg); @@ -2117,22 +2151,70 @@ static void ack_ioapic_level(struct irq_data *data) */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - - eoi_ioapic_irq(irq, cfg); + eoi_ioapic_pin(cfg->vector, cfg); } ioapic_irqd_unmask(data, cfg, masked); } +static void ioapic_ir_ack_level(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + ack_APIC_irq(); + eoi_ioapic_pin(data->entry.vector, irqd_cfg(irq_data)); +} + +static int ioapic_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irq_data->parent_data; + struct mp_chip_data *data = irq_data->chip_data; + unsigned int dest, irq = irq_data->irq; + struct irq_cfg *cfg; + unsigned long flags; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + cfg = irqd_cfg(irq_data); + data->entry.dest = cfg->dest_apicid; + data->entry.vector = cfg->vector; + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); + __target_IO_APIC_irq(irq, dest, cfg); + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; +} + static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .irq_startup = startup_ioapic_irq, .irq_mask = mask_ioapic_irq, .irq_unmask = unmask_ioapic_irq, - .irq_ack = apic_ack_edge, - .irq_eoi = ack_ioapic_level, - .irq_set_affinity = native_ioapic_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ack_level, + .irq_set_affinity = ioapic_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static struct irq_chip ioapic_ir_chip __read_mostly = { + .name = "IR-IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ir_ack_level, + .irq_set_affinity = ioapic_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2265,6 +2347,24 @@ static int __init disable_timer_pin_setup(char *arg) } early_param("disable_timer_pin_1", disable_timer_pin_setup); +static int mp_alloc_timer_irq(int ioapic, int pin) +{ + int irq = -1; + struct irq_alloc_info info; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + + if (domain) { + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); + info.ioapic_id = mpc_ioapic_id(ioapic); + info.ioapic_pin = pin; + mutex_lock(&ioapic_mutex); + irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); + mutex_unlock(&ioapic_mutex); + } + + return irq; +} + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2287,7 +2387,6 @@ static inline void __init check_timer(void) * get/set the timer IRQ vector: */ legacy_pic->mask(0); - assign_irq_vector(0, cfg, apic->target_cpus()); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2328,15 +2427,12 @@ static inline void __init check_timer(void) } if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ + /* Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq_node(cfg, node, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + mp_alloc_timer_irq(apic1, pin1); } else { - /* for edge trigger, setup_ioapic_irq already - * leave it unmasked. + /* + * for edge trigger, it's already unmasked, * so only need to unmask if it is level-trigger * do we really have level trigger timer? */ @@ -2345,6 +2441,7 @@ static inline void __init check_timer(void) if (idx != -1 && irq_trigger(idx)) unmask_ioapic(cfg); } + irq_domain_activate_irq(irq_get_irq_data(0)); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2365,7 +2462,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + irq_domain_activate_irq(irq_get_irq_data(0)); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -2443,6 +2540,8 @@ out: static int mp_irqdomain_create(int ioapic) { size_t size; + struct irq_alloc_info info; + struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; @@ -2456,9 +2555,18 @@ static int mp_irqdomain_create(int ioapic) if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info.ioapic_id = mpc_ioapic_id(ioapic); + parent = irq_remapping_get_ir_irq_domain(&info); + if (!parent) + parent = x86_vector_domain; + ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, (void *)(long)ioapic); - if(!ip->irqdomain) { + if (ip->irqdomain) { + ip->irqdomain->parent = parent; + } else { kfree(ip->pin_info); ip->pin_info = NULL; return -ENOMEM; @@ -3072,7 +3180,6 @@ int mp_unregister_ioapic(u32 gsi_base) { int ioapic, pin; int found = 0; - struct mp_pin_info *pin_info; for_each_ioapic(ioapic) if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { @@ -3085,11 +3192,17 @@ int mp_unregister_ioapic(u32 gsi_base) } for_each_pin(ioapic, pin) { - pin_info = mp_pin_info(ioapic, pin); - if (pin_info->count) { - pr_warn("pin%d on IOAPIC%d is still in use.\n", - pin, ioapic); - return -EBUSY; + u32 gsi = mp_pin_to_gsi(ioapic, pin); + int irq = mp_map_gsi_to_irq(gsi, 0, NULL); + struct mp_chip_data *data; + + if (irq >= 0) { + data = irq_get_chip_data(irq); + if (data && data->count) { + pr_warn("pin%d on IOAPIC%d is still in use.\n", + pin, ioapic); + return -EBUSY; + } } } @@ -3241,7 +3354,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, } irq_data->hwirq = info->ioapic_pin; - irq_data->chip = &ioapic_chip; + irq_data->chip = (domain->parent == x86_vector_domain) ? + &ioapic_chip : &ioapic_ir_chip; irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 6367a780cc8c..05103d398ed7 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -196,38 +196,31 @@ static struct of_ioapic_type of_ioapic_type[] = }, }; -static int ioapic_xlate(struct irq_domain *domain, - struct device_node *controller, - const u32 *intspec, u32 intsize, - irq_hw_number_t *out_hwirq, u32 *out_type) +static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { + struct of_phandle_args *irq_data = (void *)arg; struct of_ioapic_type *it; - u32 line, idx, gsi; + struct irq_alloc_info tmp; - if (WARN_ON(intsize < 2)) + if (WARN_ON(irq_data->args_count < 2)) return -EINVAL; - - line = intspec[0]; - - if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) + if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - it = &of_ioapic_type[intspec[1]]; + it = &of_ioapic_type[irq_data->args[1]]; + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); + tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic_pin = irq_data->args[0]; - idx = (u32)(long)domain->host_data; - gsi = mp_pin_to_gsi(idx, line); - if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0))) - return -EBUSY; - - *out_hwirq = line; - *out_type = it->out_type; - return 0; + return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } const struct irq_domain_ops ioapic_irq_domain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, - .xlate = ioapic_xlate, + .alloc = dt_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init dtb_add_ioapic(struct device_node *dn) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 2d2a237f2c73..aa4feee74dbe 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -114,8 +114,10 @@ static void __init MP_bus_info(struct mpc_bus *m) } static struct irq_domain_ops mp_ioapic_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init MP_ioapic_info(struct mpc_ioapic *m) diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 57b5719b617b..27062303c881 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -224,8 +224,6 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ - if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev))) - return -EBUSY; if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0) return -EBUSY; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c index de0009f6d555..de734134bc8d 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c @@ -36,8 +36,7 @@ static int tangier_probe(struct platform_device *pdev) /* IOAPIC builds identity mapping between GSI and IRQ on MID */ gsi = pdata->irq; ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); - if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) || - mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { + if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi); return -EINVAL; diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 7d17355d820e..ce992e8cc065 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -469,10 +469,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) } ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity); - ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE); - if (ret == 0) - ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, - &info); + ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info); WARN_ON(ret < 0); } diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index 2a8a74f3bd76..b66b194f9900 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -72,7 +72,10 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table) #ifdef CONFIG_X86_IO_APIC static struct irq_domain_ops sfi_ioapic_irqdomain_ops = { - .map = mp_irqdomain_map, + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static int __init sfi_parse_ioapic(struct sfi_table_header *table) -- cgit v1.2.1 From 5ad274d41c1b3f3ccf73591078efaa8ed6828a8d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:38 +0800 Subject: x86/irq: Remove unused old IOAPIC irqdomain interfaces Now we have converted to hierarchical irqdomain, so remove unused old IOAPIC interfaces and code. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-2-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 4 - arch/x86/kernel/apic/io_apic.c | 202 +---------------------------------------- 2 files changed, 1 insertion(+), 205 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index ecc192624eaf..705c425c9c3d 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -204,9 +204,6 @@ extern int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg); extern int mp_unregister_ioapic(u32 gsi_base); extern int mp_ioapic_registered(u32 gsi_base); -extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq); -extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg); extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, @@ -218,7 +215,6 @@ extern void mp_irqdomain_deactivate(struct irq_domain *domain, extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity); -extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); extern void mp_save_irq(struct mpc_intsrc *m); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ba50f8d6f6b0..523b326d71c2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -89,7 +89,6 @@ struct mp_chip_data { struct mp_pin_info { int trigger; int polarity; - int node; int set; u32 count; }; @@ -1310,30 +1309,6 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, - unsigned long trigger) -{ - struct irq_chip *chip = &ioapic_chip; - irq_flow_handler_t hdl; - bool fasteoi; - - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) { - irq_set_status_flags(irq, IRQ_LEVEL); - fasteoi = true; - } else { - irq_clear_status_flags(irq, IRQ_LEVEL); - fasteoi = false; - } - - if (setup_remapped_irq(irq, cfg, chip)) - fasteoi = trigger != 0; - - hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; - irq_set_chip_and_handler_name(irq, chip, hdl, - fasteoi ? "fasteoi" : "edge"); -} - int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, unsigned int destination, int vector, struct io_apic_irq_attr *attr) @@ -1358,48 +1333,6 @@ int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, return 0; } -static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, - struct io_apic_irq_attr *attr) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - if (!IO_APIC_IRQ(irq)) - return; - - if (assign_irq_vector(irq, cfg, apic->target_cpus())) - return; - - if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), - &dest)) { - pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i Dest:%d)\n", - attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, - cfg->vector, irq, attr->trigger, attr->polarity, dest); - - if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) { - pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - ioapic_register_intr(irq, cfg, attr->trigger); - if (irq < nr_legacy_irqs()) - legacy_pic->mask(irq); - - ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); -} - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; @@ -1419,46 +1352,6 @@ static void __init setup_IO_APIC_irqs(void) } } -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, - unsigned int pin, int vector) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), - apic->target_cpus(), &dest))) - dest = BAD_APICID; - - entry.dest_mode = apic->irq_dest_mode; - entry.mask = 0; /* don't mask IRQ for edge */ - entry.dest = dest; - entry.delivery_mode = apic->irq_delivery_mode; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_idx, pin, entry); -} - void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { int i; @@ -2669,20 +2562,6 @@ static int __init ioapic_init_ops(void) device_initcall(ioapic_init_ops); -static int -io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) -{ - struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); - int ret; - - if (!cfg) - return -EINVAL; - ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); - if (!ret) - setup_ioapic_irq(irq, cfg, attr); - return ret; -} - static int io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; @@ -3239,58 +3118,8 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, irq_attr->polarity = polarity; } -int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) -{ - int ioapic = mp_irqdomain_ioapic_idx(domain); - struct mp_pin_info *info = mp_pin_info(ioapic, hwirq); - struct io_apic_irq_attr attr; - - /* Get default attribute if not set by caller yet */ - if (!info->set) { - u32 gsi = mp_pin_to_gsi(ioapic, hwirq); - - if (acpi_get_override_irq(gsi, &info->trigger, - &info->polarity) < 0) { - /* - * PCI interrupts are always polarity one level - * triggered. - */ - info->trigger = 1; - info->polarity = 1; - } - info->node = NUMA_NO_NODE; - - /* - * setup_IO_APIC_irqs() programs all legacy IRQs with default - * trigger and polarity attributes. Don't set the flag for that - * case so the first legacy IRQ user could reprogram the pin - * with real trigger and polarity attributes. - */ - if (virq >= nr_legacy_irqs() || info->count) - info->set = 1; - } - set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger, - info->polarity); - - return io_apic_setup_irq_pin(virq, info->node, &attr); -} - -void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) -{ - struct irq_data *data = irq_get_irq_data(virq); - struct irq_cfg *cfg = irq_cfg(virq); - int ioapic = mp_irqdomain_ioapic_idx(domain); - int pin = (int)data->hwirq; - - ioapic_mask_entry(ioapic, pin); - __remove_pin_from_irq(cfg, ioapic, pin); - WARN_ON(!list_empty(&cfg->irq_2_pin)); - arch_teardown_hwirq(virq); -} - static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, - struct irq_alloc_info *info) + struct irq_alloc_info *info) { if (info && info->ioapic_valid) { data->trigger = info->ioapic_trigger; @@ -3414,35 +3243,6 @@ void mp_irqdomain_deactivate(struct irq_domain *domain, (int)irq_data->hwirq); } -int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) -{ - int ret = 0; - int ioapic, pin; - struct mp_pin_info *info; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return -ENODEV; - - pin = mp_find_ioapic_pin(ioapic, gsi); - info = mp_pin_info(ioapic, pin); - trigger = trigger ? 1 : 0; - polarity = polarity ? 1 : 0; - - mutex_lock(&ioapic_mutex); - if (!info->set) { - info->trigger = trigger; - info->polarity = polarity; - info->node = node; - info->set = 1; - } else if (info->trigger != trigger || info->polarity != polarity) { - ret = -EBUSY; - } - mutex_unlock(&ioapic_mutex); - - return ret; -} - int mp_irqdomain_ioapic_idx(struct irq_domain *domain) { return (int)(long)domain->host_data; -- cgit v1.2.1 From b75e818f7fc6db153a4ebfba1d31366c1cc531aa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:39 +0800 Subject: x86/irq: Remove unused struct mp_pin_info Now nobody makes use of struct mp_pin_info, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-3-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 523b326d71c2..3506e8aeba91 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -86,13 +86,6 @@ struct mp_chip_data { bool isa_irq; }; -struct mp_pin_info { - int trigger; - int polarity; - int set; - u32 count; -}; - static struct ioapic { /* * # of IRQ routing registers @@ -108,7 +101,6 @@ static struct ioapic { struct mp_ioapic_gsi gsi_config; struct ioapic_domain_cfg irqdomain_cfg; struct irq_domain *irqdomain; - struct mp_pin_info *pin_info; struct resource *iomem_res; } ioapics[MAX_IO_APICS]; @@ -159,11 +151,6 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq) return ioapic == 0 || mp_is_legacy_irq(irq); } -static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin) -{ - return ioapics[ioapic_idx].pin_info + pin; -} - static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) { return ioapics[ioapic].irqdomain; @@ -2432,7 +2419,6 @@ out: static int mp_irqdomain_create(int ioapic) { - size_t size; struct irq_alloc_info info; struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); @@ -2440,11 +2426,6 @@ static int mp_irqdomain_create(int ioapic) struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); - size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic); - ip->pin_info = kzalloc(size, GFP_KERNEL); - if (!ip->pin_info) - return -ENOMEM; - if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; @@ -2457,13 +2438,10 @@ static int mp_irqdomain_create(int ioapic) ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, (void *)(long)ioapic); - if (ip->irqdomain) { - ip->irqdomain->parent = parent; - } else { - kfree(ip->pin_info); - ip->pin_info = NULL; + if (!ip->irqdomain) return -ENOMEM; - } + + ip->irqdomain->parent = parent; if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT) @@ -2479,8 +2457,6 @@ static void ioapic_destroy_irqdomain(int idx) irq_domain_remove(ioapics[idx].irqdomain); ioapics[idx].irqdomain = NULL; } - kfree(ioapics[idx].pin_info); - ioapics[idx].pin_info = NULL; } void __init setup_IO_APIC(void) -- cgit v1.2.1 From 84bea5cc7709dffdadfa9885a66efd67d9ffc24c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:40 +0800 Subject: x86/irq: Remove x86_io_apic_ops.print_entries and related interfaces Now there is no user of x86_io_apic_ops.print_entries anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-4-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 3 --- arch/x86/include/asm/x86_init.h | 1 - arch/x86/kernel/apic/io_apic.c | 55 ----------------------------------------- arch/x86/kernel/x86_init.c | 1 - 4 files changed, 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 705c425c9c3d..47af5a7af358 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -225,8 +225,6 @@ extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); extern void native_disable_io_apic(void); -extern void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); -extern void intel_ir_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); extern int native_ioapic_set_affinity(struct irq_data *, const struct cpumask *, bool); @@ -290,7 +288,6 @@ static inline void disable_ioapic_support(void) { } #define native_io_apic_write NULL #define native_io_apic_modify NULL #define native_disable_io_apic NULL -#define native_io_apic_print_entries NULL #define native_ioapic_set_affinity NULL #define native_setup_ioapic_entry NULL #define native_eoi_ioapic_pin NULL diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1649bb9ca27c..2924bc88034a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -191,7 +191,6 @@ struct x86_io_apic_ops { void (*write) (unsigned int apic, unsigned int reg, unsigned int value); void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); - void (*print_entries)(unsigned int apic, unsigned int nr_entries); int (*set_affinity)(struct irq_data *data, const struct cpumask *mask, bool force); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3506e8aeba91..acb91c19f318 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1339,61 +1339,6 @@ static void __init setup_IO_APIC_irqs(void) } } -void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries) -{ - int i; - - pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n"); - - for (i = 0; i <= nr_entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - pr_debug(" %02x %02X ", i, entry.dest); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector); - } -} - -void intel_ir_io_apic_print_entries(unsigned int apic, - unsigned int nr_entries) -{ - int i; - - pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n"); - - for (i = 0; i <= nr_entries; i++) { - struct IR_IO_APIC_route_entry *ir_entry; - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - ir_entry = (struct IR_IO_APIC_route_entry *)&entry; - - pr_debug(" %02x %04X ", i, ir_entry->index); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %X %02X\n", - ir_entry->format, - ir_entry->mask, - ir_entry->trigger, - ir_entry->irr, - ir_entry->polarity, - ir_entry->delivery_status, - ir_entry->index2, - ir_entry->zero, - ir_entry->vector); - } -} - void ioapic_zap_locks(void) { raw_spin_lock_init(&ioapic_lock); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index b094d691f2fe..d6f36c7594d7 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -144,7 +144,6 @@ struct x86_io_apic_ops x86_io_apic_ops = { .write = native_io_apic_write, .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .print_entries = native_io_apic_print_entries, .set_affinity = native_ioapic_set_affinity, .setup_entry = native_setup_ioapic_entry, .eoi_ioapic_pin = native_eoi_ioapic_pin, -- cgit v1.2.1 From 35d50d8fd5b8f932b3e71311a4cbd4384501ab9a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:41 +0800 Subject: x86/irq: Remove x86_io_apic_ops.setup_entry and related interfaces Now there is no user of x86_io_apic_ops.setup_entry anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-5-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 4 ---- arch/x86/include/asm/irq_remapping.h | 13 ------------- arch/x86/include/asm/x86_init.h | 3 --- arch/x86/kernel/apic/io_apic.c | 24 ------------------------ arch/x86/kernel/x86_init.c | 1 - 5 files changed, 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 47af5a7af358..d2a34e4718d2 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -150,9 +150,6 @@ struct irq_cfg; extern void ioapic_insert_resources(void); extern int arch_early_ioapic_init(void); -extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, - unsigned int, int, - struct io_apic_irq_attr *); extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); extern void native_eoi_ioapic_pin(int apic, int pin, int vector); @@ -289,7 +286,6 @@ static inline void disable_ioapic_support(void) { } #define native_io_apic_modify NULL #define native_disable_io_apic NULL #define native_ioapic_set_affinity NULL -#define native_setup_ioapic_entry NULL #define native_eoi_ioapic_pin NULL static inline void setup_IO_APIC(void) { } diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 52e1a1f337b2..4b5bbd104ae2 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -42,11 +42,6 @@ extern int irq_remapping_enable(void); extern void irq_remapping_disable(void); extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); -extern int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr); extern void free_remapped_irq(int irq); extern void panic_if_irq_remap(const char *msg); extern bool setup_remapped_irq(int irq, @@ -77,14 +72,6 @@ static inline int irq_remapping_enable(void) { return -ENODEV; } static inline void irq_remapping_disable(void) { } static inline int irq_remapping_reenable(int eim) { return -ENODEV; } static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } -static inline int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr) -{ - return -ENODEV; -} static inline void free_remapped_irq(int irq) { } static inline void panic_if_irq_remap(const char *msg) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 2924bc88034a..0c690574efae 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -194,9 +194,6 @@ struct x86_io_apic_ops { int (*set_affinity)(struct irq_data *data, const struct cpumask *mask, bool force); - int (*setup_entry)(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr); void (*eoi_ioapic_pin)(int apic, int pin, int vector); }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index acb91c19f318..cf5cd19b74e3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1296,30 +1296,6 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - memset(entry, 0, sizeof(*entry)); - - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->dest = destination; - entry->vector = vector; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* - * Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index d6f36c7594d7..066cdaa6503e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -145,6 +145,5 @@ struct x86_io_apic_ops x86_io_apic_ops = { .modify = native_io_apic_modify, .disable = native_disable_io_apic, .set_affinity = native_ioapic_set_affinity, - .setup_entry = native_setup_ioapic_entry, .eoi_ioapic_pin = native_eoi_ioapic_pin, }; -- cgit v1.2.1 From aa5cb97f14a2dd5aefabed6538c35ebc087d7c24 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:42 +0800 Subject: x86/irq: Remove x86_io_apic_ops.set_affinity and related interfaces Now there is no user of x86_io_apic_ops.set_affinity anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-6-git-send-email-jiang.liu@linux.intel.com --- arch/x86/include/asm/io_apic.h | 4 ---- arch/x86/include/asm/x86_init.h | 3 --- arch/x86/kernel/apic/io_apic.c | 25 +------------------------ arch/x86/kernel/x86_init.c | 1 - 4 files changed, 1 insertion(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index d2a34e4718d2..0ff68daa9949 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -222,9 +222,6 @@ extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); extern void native_disable_io_apic(void); -extern int native_ioapic_set_affinity(struct irq_data *, - const struct cpumask *, - bool); static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { @@ -285,7 +282,6 @@ static inline void disable_ioapic_support(void) { } #define native_io_apic_write NULL #define native_io_apic_modify NULL #define native_disable_io_apic NULL -#define native_ioapic_set_affinity NULL #define native_eoi_ioapic_pin NULL static inline void setup_IO_APIC(void) { } diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 0c690574efae..f9f83cfabcaa 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -191,9 +191,6 @@ struct x86_io_apic_ops { void (*write) (unsigned int apic, unsigned int reg, unsigned int value); void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); - int (*set_affinity)(struct irq_data *data, - const struct cpumask *mask, - bool force); void (*eoi_ioapic_pin)(int apic, int pin, int vector); }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index cf5cd19b74e3..9ef964512b86 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1787,29 +1787,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq } } -int native_ioapic_set_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force) -{ - unsigned int dest, irq = data->irq; - unsigned long flags; - int ret; - - if (!config_enabled(CONFIG_SMP)) - return -EPERM; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = apic_set_affinity(data, mask, &dest); - if (!ret) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, irqd_cfg(data)); - ret = IRQ_SET_MASK_OK_NOCOPY; - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; -} - atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -2686,7 +2663,7 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); - x86_io_apic_ops.set_affinity(idata, mask, false); + irq_set_affinity(irq, mask); } } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 066cdaa6503e..f7e8eab3a7c4 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -144,6 +144,5 @@ struct x86_io_apic_ops x86_io_apic_ops = { .write = native_io_apic_write, .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .set_affinity = native_ioapic_set_affinity, .eoi_ioapic_pin = native_eoi_ioapic_pin, }; -- cgit v1.2.1 From ad66e1efc95e548598b032c1fe5bbc34f6460547 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:43 +0800 Subject: x86/irq: Remove x86_io_apic_ops.eoi_ioapic_pin and related interfaces Now there is no user of x86_io_apic_ops.eoi_ioapic_pin anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-7-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 7 ------- arch/x86/include/asm/x86_init.h | 1 - arch/x86/kernel/apic/io_apic.c | 20 ++++---------------- arch/x86/kernel/x86_init.c | 1 - 4 files changed, 4 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 0ff68daa9949..fa4b25ebd658 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -150,10 +150,6 @@ struct irq_cfg; extern void ioapic_insert_resources(void); extern int arch_early_ioapic_init(void); -extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); - -extern void native_eoi_ioapic_pin(int apic, int pin, int vector); - extern int save_ioapic_entries(void); extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); @@ -237,8 +233,6 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned x86_io_apic_ops.modify(apic, reg, value); } -extern void io_apic_eoi(unsigned int apic, unsigned int vector); - extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); extern void disable_IO_APIC(void); @@ -282,7 +276,6 @@ static inline void disable_ioapic_support(void) { } #define native_io_apic_write NULL #define native_io_apic_modify NULL #define native_disable_io_apic NULL -#define native_eoi_ioapic_pin NULL static inline void setup_IO_APIC(void) { } static inline void enable_IO_APIC(void) { } diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f9f83cfabcaa..4ada3d3a0e86 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -191,7 +191,6 @@ struct x86_io_apic_ops { void (*write) (unsigned int apic, unsigned int reg, unsigned int value); void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); - void (*eoi_ioapic_pin)(int apic, int pin, int vector); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9ef964512b86..998fefad820e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -271,7 +271,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) + (mpc_ioapic_addr(idx) & ~PAGE_MASK); } -void io_apic_eoi(unsigned int apic, unsigned int vector) +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(vector, &io_apic->eoi); @@ -527,7 +527,7 @@ static void unmask_ioapic_irq(struct irq_data *data) * Otherwise, we simulate the EOI message manually by changing the trigger * mode to edge and then back to level, with RTE being masked during this. */ -void native_eoi_ioapic_pin(int apic, int pin, int vector) +static void __eoi_ioapic_pin(int apic, int pin, int vector) { if (mpc_ioapic_ver(apic) >= 0x20) { io_apic_eoi(apic, vector); @@ -558,19 +558,7 @@ void eoi_ioapic_pin(int vector, struct irq_cfg *cfg) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) - native_eoi_ioapic_pin(entry->apic, entry->pin, vector); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); -} - -void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) - x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin, - cfg->vector); + __eoi_ioapic_pin(entry->apic, entry->pin, vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -606,7 +594,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ioapic_write_entry(apic, pin, entry); } raw_spin_lock_irqsave(&ioapic_lock, flags); - native_eoi_ioapic_pin(apic, pin, entry.vector); + __eoi_ioapic_pin(apic, pin, entry.vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f7e8eab3a7c4..f612dc018fb6 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -144,5 +144,4 @@ struct x86_io_apic_ops x86_io_apic_ops = { .write = native_io_apic_write, .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .eoi_ioapic_pin = native_eoi_ioapic_pin, }; -- cgit v1.2.1 From baac16952635445addaf397bad74e847db821d6d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:44 +0800 Subject: x86/irq: Remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ There's no user of irq_alloc_hwirqs(), irq_alloc_hwirq(), irq_free_hwirqs() and irq_free_hwirq() in x86 anymore, so remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ and related code. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-8-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 - arch/x86/kernel/apic/vector.c | 34 ---------------------------------- 2 files changed, 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 26e066579637..c86fdc13615f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -912,7 +912,6 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI - select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ select IRQ_DOMAIN_HIERARCHY select PCI_MSI_IRQ_DOMAIN if PCI_MSI diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 633f03268d48..d0e5ea0fb947 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -659,40 +659,6 @@ void irq_force_complete_move(int irq) } #endif -/* - * Dynamic irq allocate and deallocation. Should be replaced by irq domains! - */ -int arch_setup_hwirq(unsigned int irq, int node) -{ - struct irq_cfg *cfg; - unsigned long flags; - int ret; - - cfg = alloc_irq_cfg(node); - if (!cfg) - return -ENOMEM; - - raw_spin_lock_irqsave(&vector_lock, flags); - ret = __assign_irq_vector(irq, cfg, apic->target_cpus()); - raw_spin_unlock_irqrestore(&vector_lock, flags); - - if (!ret) - irq_set_chip_data(irq, cfg); - else - free_irq_cfg(cfg); - return ret; -} - -void arch_teardown_hwirq(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - free_remapped_irq(irq); - clear_irq_vector(irq, cfg); - irq_set_chip_data(irq, NULL); - free_irq_cfg(cfg); -} - static void __init print_APIC_field(int base) { int i; -- cgit v1.2.1 From 3dd786ea3a0753bb19a5fd5103739a7cb9ec92c1 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:45 +0800 Subject: x86/irq: Clean up unused forward declarations in x86_init.h Clean up unused forward declarations in x86_init.h. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Yijing Wang Link: http://lkml.kernel.org/r/1428978610-28986-9-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 4ada3d3a0e86..09d4dab9302f 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -171,7 +171,6 @@ struct x86_platform_ops { }; struct pci_dev; -struct msi_msg; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); @@ -180,11 +179,6 @@ struct x86_msi_ops { void (*restore_msi_irqs)(struct pci_dev *dev); }; -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_data; -struct cpumask; - struct x86_io_apic_ops { void (*init) (void); unsigned int (*read) (unsigned int apic, unsigned int reg); -- cgit v1.2.1 From 9880534989ba96faad26aebc01dcdb2c1b5793aa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:46 +0800 Subject: irq_remapping: Clean up unsued code to support IOAPIC Now we have converted to hierarchical irqdomains, so clean up unused code. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428978610-28986-10-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 23 ----------------------- arch/x86/kernel/apic/vector.c | 1 - 2 files changed, 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 4b5bbd104ae2..09efa358c831 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -26,12 +26,7 @@ #include #include -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_chip; struct msi_msg; -struct pci_dev; -struct irq_cfg; struct irq_alloc_info; #ifdef CONFIG_IRQ_REMAP @@ -42,13 +37,7 @@ extern int irq_remapping_enable(void); extern void irq_remapping_disable(void); extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); -extern void free_remapped_irq(int irq); extern void panic_if_irq_remap(const char *msg); -extern bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip); - -void irq_remap_modify_chip_defaults(struct irq_chip *chip); extern struct irq_domain * irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info); @@ -72,23 +61,11 @@ static inline int irq_remapping_enable(void) { return -ENODEV; } static inline void irq_remapping_disable(void) { } static inline int irq_remapping_reenable(int eim) { return -ENODEV; } static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } -static inline void free_remapped_irq(int irq) { } static inline void panic_if_irq_remap(const char *msg) { } -static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) -{ -} - -static inline bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip) -{ - return false; -} - static inline struct irq_domain * irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) { diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index d0e5ea0fb947..37bb9e82b919 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -283,7 +283,6 @@ static void x86_vector_free_irqs(struct irq_domain *domain, for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); if (irq_data && irq_data->chip_data) { - free_remapped_irq(virq); clear_irq_vector(virq + i, irq_data->chip_data); free_irq_cfg(irq_data->chip_data); #ifdef CONFIG_X86_IO_APIC -- cgit v1.2.1 From bac4f90784efb858cfafdd7401dede6ef9563818 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:50 +0800 Subject: x86/irq: Remove irq_cfg.irq_remapped Now there is no user of irq_cfg.irq_remapped, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428978610-28986-14-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 75d0db1db8a0..86e4698a0025 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -193,7 +193,6 @@ struct irq_cfg { u8 vector; u8 move_in_progress : 1; #ifdef CONFIG_IRQ_REMAP - u8 remapped : 1; union { struct irq_2_iommu irq_2_iommu; struct irq_2_irte irq_2_irte; -- cgit v1.2.1 From 099c5c03487f6bca30c628e14e666788dd61fb33 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:51 +0800 Subject: irq_remapping/vt-d: Move struct irq_2_iommu into intel_irq_remapping.c Now only intel_irq_remapping.c access irq_2_iommu, so move it from hw_irq.h into intel_irq_remapping.c. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428978610-28986-15-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 86e4698a0025..1e0ee1029ef6 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -95,14 +95,6 @@ extern void trace_call_function_single_interrupt(void); #endif /* CONFIG_TRACING */ #ifdef CONFIG_IRQ_REMAP -/* Intel specific interrupt remapping information */ -struct irq_2_iommu { - struct intel_iommu *iommu; - u16 irte_index; - u16 sub_handle; - u8 irte_mask; -}; - /* AMD specific interrupt remapping information */ struct irq_2_irte { u16 devid; /* Device ID for IRTE table */ @@ -194,7 +186,6 @@ struct irq_cfg { u8 move_in_progress : 1; #ifdef CONFIG_IRQ_REMAP union { - struct irq_2_iommu irq_2_iommu; struct irq_2_irte irq_2_irte; }; #endif -- cgit v1.2.1 From 9c72496698a4dadd406d159f7735851a63ef9412 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:52 +0800 Subject: irq_remapping/amd: Move struct irq_2_irte into amd_iommu.c Now only amd_iommu.c access irq_2_irte, so move it from hw_irq.h into amd_iommu.c. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428978610-28986-16-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 1e0ee1029ef6..e47bc4de5630 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -94,14 +94,6 @@ extern void trace_call_function_single_interrupt(void); #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi #endif /* CONFIG_TRACING */ -#ifdef CONFIG_IRQ_REMAP -/* AMD specific interrupt remapping information */ -struct irq_2_irte { - u16 devid; /* Device ID for IRTE table */ - u16 index; /* Index into IRTE table*/ -}; -#endif /* CONFIG_IRQ_REMAP */ - struct irq_domain; #ifdef CONFIG_X86_LOCAL_APIC @@ -184,11 +176,6 @@ struct irq_cfg { unsigned int dest_apicid; u8 vector; u8 move_in_progress : 1; -#ifdef CONFIG_IRQ_REMAP - union { - struct irq_2_irte irq_2_irte; - }; -#endif union { #ifdef CONFIG_X86_IO_APIC struct { -- cgit v1.2.1 From 4467715a44cca2fa41d25f3d32b737bd2331a8d9 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:53 +0800 Subject: x86/irq: Move irq_cfg.irq_2_pin into io_apic.c Now only io_apic.c accesses struct irq_cfg.irq_2_pin, so move irq_2_pin into struct mp_chip_data in io_apic.c to clean up struct irq_cfg further. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-17-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 7 -- arch/x86/kernel/apic/io_apic.c | 164 +++++++++++++++++++---------------------- arch/x86/kernel/apic/vector.c | 3 - 3 files changed, 77 insertions(+), 97 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e47bc4de5630..f000b58cbc0c 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -176,13 +176,6 @@ struct irq_cfg { unsigned int dest_apicid; u8 vector; u8 move_in_progress : 1; - union { -#ifdef CONFIG_X86_IO_APIC - struct { - struct list_head irq_2_pin; - }; -#endif - }; }; extern struct irq_domain *x86_vector_domain; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 998fefad820e..a1abdcf2cb5f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -78,7 +78,13 @@ static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; static int ioapic_initialized; +struct irq_pin_list { + struct list_head list; + int apic, pin; +}; + struct mp_chip_data { + struct list_head irq_2_pin; struct IO_APIC_route_entry entry; int trigger; int polarity; @@ -215,16 +221,6 @@ void mp_save_irq(struct mpc_intsrc *m) panic("Max # of irq sources exceeded!!\n"); } -struct irq_pin_list { - struct list_head list; - int apic, pin; -}; - -static struct irq_pin_list *alloc_irq_pin_list(int node) -{ - return kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); -} - static void alloc_ioapic_saved_registers(int idx) { size_t size; @@ -379,16 +375,17 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int __add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { struct irq_pin_list *entry; /* don't allow duplicates */ - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) if (entry->apic == apic && entry->pin == pin) return 0; - entry = alloc_irq_pin_list(node); + entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); if (!entry) { pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", node, apic, pin); @@ -396,16 +393,16 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi } entry->apic = apic; entry->pin = pin; + list_add_tail(&entry->list, &data->irq_2_pin); - list_add_tail(&entry->list, &cfg->irq_2_pin); return 0; } -static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) +static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) { struct irq_pin_list *tmp, *entry; - list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list) + list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) if (entry->apic == apic && entry->pin == pin) { list_del(&entry->list); kfree(entry); @@ -413,22 +410,23 @@ static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) } } -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static void add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { - if (__add_pin_to_irq_node(cfg, node, apic, pin)) + if (__add_pin_to_irq_node(data, node, apic, pin)) panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, +static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, int oldapic, int oldpin, int newapic, int newpin) { struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; @@ -438,7 +436,7 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, } /* old apic/pin didn't exist, so just add new ones */ - add_pin_to_irq_node(cfg, node, newapic, newpin); + add_pin_to_irq_node(data, node, newapic, newpin); } static void __io_apic_modify_irq(struct irq_pin_list *entry, @@ -456,13 +454,13 @@ static void __io_apic_modify_irq(struct irq_pin_list *entry, final(entry); } -static void io_apic_modify_irq(struct irq_cfg *cfg, +static void io_apic_modify_irq(struct mp_chip_data *data, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) __io_apic_modify_irq(entry, mask_and, mask_or, final); } @@ -478,39 +476,31 @@ static void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void mask_ioapic(struct irq_cfg *cfg) +static void mask_ioapic_irq(struct irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_ioapic_irq(struct irq_data *data) -{ - mask_ioapic(irqd_cfg(data)); -} - -static void __unmask_ioapic(struct irq_cfg *cfg) +static void __unmask_ioapic(struct mp_chip_data *data) { - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); } -static void unmask_ioapic(struct irq_cfg *cfg) +static void unmask_ioapic_irq(struct irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - __unmask_ioapic(cfg); + __unmask_ioapic(data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_ioapic_irq(struct irq_data *data) -{ - unmask_ioapic(irqd_cfg(data)); -} - /* * IO-APIC versions below 0x20 don't support EOI register. * For the record, here is the information about various versions: @@ -551,13 +541,13 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) } } -void eoi_ioapic_pin(int vector, struct irq_cfg *cfg) +void eoi_ioapic_pin(int vector, struct mp_chip_data *data) { unsigned long flags; struct irq_pin_list *entry; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) __eoi_ioapic_pin(entry->apic, entry->pin, vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1068,11 +1058,10 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, * entry. The IOAPIC entry */ if (irq_data && irq_data->parent_data) { - struct irq_cfg *cfg = irqd_cfg(irq_data); - if (!mp_check_pin_attr(irq, info)) return -EBUSY; - if (__add_pin_to_irq_node(cfg, node, ioapic, info->ioapic_pin)) + if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, + info->ioapic_pin)) return -ENOMEM; } else { irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); @@ -1394,9 +1383,7 @@ static void __init print_IO_APIC(int ioapic_idx) void __init print_IO_APICs(void) { int ioapic_idx; - struct irq_cfg *cfg; unsigned int irq; - struct irq_chip *chip; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for_each_ioapic(ioapic_idx) @@ -1416,18 +1403,20 @@ void __init print_IO_APICs(void) printk(KERN_DEBUG "IRQ to pin mappings:\n"); for_each_active_irq(irq) { struct irq_pin_list *entry; + struct irq_chip *chip; + struct mp_chip_data *data; chip = irq_get_chip(irq); if (chip != &ioapic_chip && chip != &ioapic_ir_chip) continue; - - cfg = irq_cfg(irq); - if (!cfg) + data = irq_get_chip_data(irq); + if (!data) continue; - if (list_empty(&cfg->irq_2_pin)) + if (list_empty(&data->irq_2_pin)) continue; + printk(KERN_DEBUG "IRQ%d ", irq); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) pr_cont("-> %d:%d", entry->apic, entry->pin); pr_cont("\n"); } @@ -1740,7 +1729,7 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) if (legacy_pic->irq_pending(irq)) was_pending = 1; } - __unmask_ioapic(irqd_cfg(data)); + __unmask_ioapic(data->chip_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -1755,13 +1744,15 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) * races. */ -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +static void __target_IO_APIC_irq(unsigned int irq, struct irq_cfg *cfg, + struct mp_chip_data *data) { int apic, pin; struct irq_pin_list *entry; u8 vector = cfg->vector; + unsigned int dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { unsigned int reg; apic = entry->apic; @@ -1778,13 +1769,13 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ -static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +static bool io_apic_level_ack_pending(struct mp_chip_data *data) { struct irq_pin_list *entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { unsigned int reg; int pin; @@ -1801,18 +1792,17 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) return false; } -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { /* If we are moving the irq we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic(cfg); + mask_ioapic_irq(data); return true; } return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { if (unlikely(masked)) { /* Only migrate the irq if the ack has been received. @@ -1841,31 +1831,30 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. */ - if (!io_apic_level_ack_pending(cfg)) + if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic(cfg); + unmask_ioapic_irq(data); } } #else -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { } #endif -static void ioapic_ack_level(struct irq_data *data) +static void ioapic_ack_level(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); + struct irq_cfg *cfg = irqd_cfg(irq_data); unsigned long v; bool masked; int i; irq_complete_move(cfg); - masked = ioapic_irqd_mask(data, cfg); + masked = ioapic_irqd_mask(irq_data); /* * It appears there is an erratum which affects at least version 0x11 @@ -1917,10 +1906,10 @@ static void ioapic_ack_level(struct irq_data *data) */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - eoi_ioapic_pin(cfg->vector, cfg); + eoi_ioapic_pin(cfg->vector, irq_data->chip_data); } - ioapic_irqd_unmask(data, cfg, masked); + ioapic_irqd_unmask(irq_data, masked); } static void ioapic_ir_ack_level(struct irq_data *irq_data) @@ -1934,7 +1923,7 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) * EOI we use the pin number. */ ack_APIC_irq(); - eoi_ioapic_pin(data->entry.vector, irqd_cfg(irq_data)); + eoi_ioapic_pin(data->entry.vector, data); } static int ioapic_set_affinity(struct irq_data *irq_data, @@ -1942,7 +1931,6 @@ static int ioapic_set_affinity(struct irq_data *irq_data, { struct irq_data *parent = irq_data->parent_data; struct mp_chip_data *data = irq_data->chip_data; - unsigned int dest, irq = irq_data->irq; struct irq_cfg *cfg; unsigned long flags; int ret; @@ -1953,9 +1941,7 @@ static int ioapic_set_affinity(struct irq_data *irq_data, cfg = irqd_cfg(irq_data); data->entry.dest = cfg->dest_apicid; data->entry.vector = cfg->vector; - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); - __target_IO_APIC_irq(irq, dest, cfg); + __target_IO_APIC_irq(irq_data->irq, cfg, irq_data->chip_data); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2116,10 +2102,11 @@ early_param("disable_timer_pin_1", disable_timer_pin_setup); static int mp_alloc_timer_irq(int ioapic, int pin) { int irq = -1; - struct irq_alloc_info info; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); if (domain) { + struct irq_alloc_info info; + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); info.ioapic_id = mpc_ioapic_id(ioapic); info.ioapic_pin = pin; @@ -2141,7 +2128,9 @@ static int mp_alloc_timer_irq(int ioapic, int pin) */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg(0); + struct irq_data *irq_data = irq_get_irq_data(0); + struct mp_chip_data *data = irq_data->chip_data; + struct irq_cfg *cfg = irqd_cfg(irq_data); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2205,9 +2194,9 @@ static inline void __init check_timer(void) int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic(cfg); + unmask_ioapic_irq(irq_get_chip_data(0)); } - irq_domain_activate_irq(irq_get_irq_data(0)); + irq_domain_activate_irq(irq_data); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2227,8 +2216,8 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); - irq_domain_activate_irq(irq_get_irq_data(0)); + replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_activate_irq(irq_data); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -3044,6 +3033,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return ret; } + INIT_LIST_HEAD(&data->irq_2_pin); irq_data->hwirq = info->ioapic_pin; irq_data->chip = (domain->parent == x86_vector_domain) ? &ioapic_chip : &ioapic_ir_chip; @@ -3051,7 +3041,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); cfg = irqd_cfg(irq_data); - add_pin_to_irq_node(cfg, info->ioapic_node, ioapic, pin); + add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); if (info->ioapic_entry) mp_setup_entry(cfg, data, info->ioapic_entry); mp_register_handler(virq, data->trigger); @@ -3069,15 +3059,16 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { - struct irq_cfg *cfg = irq_cfg(virq); struct irq_data *irq_data; + struct mp_chip_data *data; BUG_ON(nr_irqs != 1); irq_data = irq_domain_get_irq_data(domain, virq); if (irq_data && irq_data->chip_data) { - __remove_pin_from_irq(cfg, mp_irqdomain_ioapic_idx(domain), + data = irq_data->chip_data; + __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq); - WARN_ON(!list_empty(&cfg->irq_2_pin)); + WARN_ON(!list_empty(&data->irq_2_pin)); kfree(irq_data->chip_data); } irq_domain_free_irqs_top(domain, virq, nr_irqs); @@ -3089,10 +3080,9 @@ void mp_irqdomain_activate(struct irq_domain *domain, unsigned long flags; struct irq_pin_list *entry; struct mp_chip_data *data = irq_data->chip_data; - struct irq_cfg *cfg = irqd_cfg(irq_data); raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) __ioapic_write_entry(entry->apic, entry->pin, data->entry); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 37bb9e82b919..af224e6774d8 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -68,9 +68,6 @@ static struct irq_cfg *alloc_irq_cfg(int node) goto out_cfg; if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) goto out_domain; -#ifdef CONFIG_X86_IO_APIC - INIT_LIST_HEAD(&cfg->irq_2_pin); -#endif return cfg; out_domain: free_cpumask_var(cfg->domain); -- cgit v1.2.1 From 50a6ad84b2a2c971e76d57884d61a5a55d7c1601 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:54 +0800 Subject: x86/irq: Remove struct io_apic_irq_attr Now there's no user of struct io_apic_irq_attr anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-18-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 7 ------- arch/x86/kernel/apic/io_apic.c | 10 ---------- 2 files changed, 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index fa4b25ebd658..4eb4bcc5f219 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -157,13 +157,6 @@ extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); -struct io_apic_irq_attr { - int ioapic; - int ioapic_pin; - int trigger; - int polarity; -}; - enum ioapic_domain_type { IOAPIC_DOMAIN_INVALID, IOAPIC_DOMAIN_LEGACY, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a1abdcf2cb5f..76dc9f5bfdbc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2959,16 +2959,6 @@ int mp_ioapic_registered(u32 gsi_base) return 0; } -static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, - int ioapic, int ioapic_pin, - int trigger, int polarity) -{ - irq_attr->ioapic = ioapic; - irq_attr->ioapic_pin = ioapic_pin; - irq_attr->trigger = trigger; - irq_attr->polarity = polarity; -} - static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { -- cgit v1.2.1 From 9a93d4736ec5ec322ec8f240a292c1a86cd0876d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:55 +0800 Subject: x86/irq: Remove x86_io_apic_ops.write and x86_io_apic_ops.modify x86_io_apic_ops.write is always set to native_io_apic_write(), and nobody overrides it. So get rid of the indirection by changing native_io_apic_write() as io_apic_write() and removing x86_io_apic_ops.write. Do the same for x86_io_apic_ops.modify and native_io_apic_modify(). Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-19-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 13 ------------- arch/x86/include/asm/x86_init.h | 2 -- arch/x86/kernel/apic/io_apic.c | 6 ++++-- arch/x86/kernel/x86_init.c | 2 -- 4 files changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 4eb4bcc5f219..c6486dd44418 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -208,8 +208,6 @@ extern void disable_ioapic_support(void); extern void __init native_io_apic_init_mappings(void); extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); -extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); -extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); extern void native_disable_io_apic(void); static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) @@ -217,15 +215,6 @@ static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) return x86_io_apic_ops.read(apic, reg); } -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - x86_io_apic_ops.write(apic, reg, value); -} -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - x86_io_apic_ops.modify(apic, reg, value); -} - extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); extern void disable_IO_APIC(void); @@ -266,8 +255,6 @@ static inline void mp_save_irq(struct mpc_intsrc *m) { }; static inline void disable_ioapic_support(void) { } #define native_io_apic_init_mappings NULL #define native_io_apic_read NULL -#define native_io_apic_write NULL -#define native_io_apic_modify NULL #define native_disable_io_apic NULL static inline void setup_IO_APIC(void) { } diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 09d4dab9302f..844b37d55a44 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -182,8 +182,6 @@ struct x86_msi_ops { struct x86_io_apic_ops { void (*init) (void); unsigned int (*read) (unsigned int apic, unsigned int reg); - void (*write) (unsigned int apic, unsigned int reg, unsigned int value); - void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 76dc9f5bfdbc..d687a10ed3a2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -280,7 +280,8 @@ unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) return readl(&io_apic->data); } -void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +static void io_apic_write(unsigned int apic, unsigned int reg, + unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -294,7 +295,8 @@ void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int valu * * Older SiS APIC requires we rewrite the index register */ -void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +static void io_apic_modify(unsigned int apic, unsigned int reg, + unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f612dc018fb6..633f07845099 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -141,7 +141,5 @@ void arch_restore_msi_irqs(struct pci_dev *dev) struct x86_io_apic_ops x86_io_apic_ops = { .init = native_io_apic_init_mappings, .read = native_io_apic_read, - .write = native_io_apic_write, - .modify = native_io_apic_modify, .disable = native_disable_io_apic, }; -- cgit v1.2.1 From ca1b88622e9c16df7b1e0a57e9c6c2300321bed4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Apr 2015 13:57:48 +0200 Subject: x86: Remove more unmodified io_apic_ops io_apic_ops.init() is either NULL, if IO-APIC support is disabled at compile time or native_io_apic_init_mappings(). No point to have that as we can achieve the same thing with an empty inline. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 6 +++--- arch/x86/include/asm/x86_init.h | 1 - arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/setup.c | 3 +-- arch/x86/kernel/x86_init.c | 1 - 5 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index c6486dd44418..f80f4efa1717 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -206,7 +206,7 @@ extern void mp_save_irq(struct mpc_intsrc *m); extern void disable_ioapic_support(void); -extern void __init native_io_apic_init_mappings(void); +extern void __init io_apic_init_mappings(void); extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); extern void native_disable_io_apic(void); @@ -251,9 +251,9 @@ static inline int restore_ioapic_entries(void) return -ENOMEM; } -static inline void mp_save_irq(struct mpc_intsrc *m) { }; +static inline void mp_save_irq(struct mpc_intsrc *m) { } static inline void disable_ioapic_support(void) { } -#define native_io_apic_init_mappings NULL +static inline void io_apic_init_mappings(void) { } #define native_io_apic_read NULL #define native_disable_io_apic NULL diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 844b37d55a44..48d34d28f5a6 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -180,7 +180,6 @@ struct x86_msi_ops { }; struct x86_io_apic_ops { - void (*init) (void); unsigned int (*read) (unsigned int apic, unsigned int reg); void (*disable)(void); }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d687a10ed3a2..3029502b0a50 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2687,7 +2687,7 @@ static struct resource * __init ioapic_setup_resources(void) return res; } -void __init native_io_apic_init_mappings(void) +void __init io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d74ac33290ae..8d04a7594a03 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1222,8 +1222,7 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); init_apic_mappings(); - if (x86_io_apic_ops.init) - x86_io_apic_ops.init(); + io_apic_init_mappings(); kvm_guest_init(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 633f07845099..3cee10abf01d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -139,7 +139,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev) #endif struct x86_io_apic_ops x86_io_apic_ops = { - .init = native_io_apic_init_mappings, .read = native_io_apic_read, .disable = native_disable_io_apic, }; -- cgit v1.2.1 From 154d9e50e413ee144d48ccd6c402633ffbecbfff Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:56 +0800 Subject: x86/irq: Clean up io_apic.h Clean up io_apic.h by: 1) moving definition of struct mp_ioapic_gsi into io_apic.c 2) changing mp_pin_to_gsi() and mp_ioapic_gsi_routing() as static 3) removing unused MP_MAX_IOAPIC_PIN 4) removing useless forward declaration 5) removing useless comments Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-20-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 13 ++----------- arch/x86/kernel/apic/io_apic.c | 23 ++++++++--------------- 2 files changed, 10 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index f80f4efa1717..9b9050129ee7 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -113,9 +113,6 @@ extern int nr_ioapics; extern int mpc_ioapic_id(int ioapic); extern unsigned int mpc_ioapic_addr(int ioapic); -extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic); - -#define MP_MAX_IOAPIC_PIN 127 /* # of MP IRQ source entries */ extern int mp_irq_entries; @@ -135,6 +132,8 @@ extern int noioapicquirk; /* -1 if "noapic" boot option passed */ extern int noioapicreroute; +extern u32 gsi_top; + extern unsigned long io_apic_irqs; #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs)) @@ -174,15 +173,8 @@ struct ioapic_domain_cfg { struct device_node *dev; }; -struct mp_ioapic_gsi{ - u32 gsi_base; - u32 gsi_end; -}; -extern u32 gsi_top; - extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); -extern u32 mp_pin_to_gsi(int ioapic, int pin); extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info); extern void mp_unmap_irq(int irq); @@ -231,7 +223,6 @@ static inline int arch_early_ioapic_init(void) { return 0; } static inline void print_IO_APICs(void) {} #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } -static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; } static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3029502b0a50..4c7da8483398 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -63,7 +63,6 @@ #define for_each_ioapic_pin(idx, pin) \ for_each_ioapic((idx)) \ for_each_pin((idx), (pin)) - #define for_each_irq_pin(entry, head) \ list_for_each_entry(entry, &head, list) @@ -92,6 +91,11 @@ struct mp_chip_data { bool isa_irq; }; +struct mp_ioapic_gsi { + u32 gsi_base; + u32 gsi_end; +}; + static struct ioapic { /* * # of IRQ routing registers @@ -122,7 +126,7 @@ unsigned int mpc_ioapic_addr(int ioapic_idx) return ioapics[ioapic_idx].mp_config.apicaddr; } -struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) +static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) { return &ioapics[ioapic_idx].gsi_config; } @@ -134,7 +138,7 @@ static inline int mp_ioapic_pin_count(int ioapic) return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; } -u32 mp_pin_to_gsi(int ioapic, int pin) +static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; } @@ -1153,8 +1157,7 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL); } -int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, - struct irq_alloc_info *info) +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) { int ioapic, pin, idx; @@ -1719,7 +1722,6 @@ static int __init timer_irq_works(void) * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... */ - static unsigned int startup_ioapic_irq(struct irq_data *data) { int was_pending = 0, irq = data->irq; @@ -1737,15 +1739,6 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) return was_pending; } -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - static void __target_IO_APIC_irq(unsigned int irq, struct irq_cfg *cfg, struct mp_chip_data *data) { -- cgit v1.2.1 From 0be275e3a5607b23f5132121bca22a10ee23aa99 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:57 +0800 Subject: x86/irq: Use cached IOAPIC entry instead of reading from hardware Use cached IOAPIC entry instead of reading data from IOAPIC hardware registers to improve performance. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-21-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 78 ++++++++++++------------------------------ 1 file changed, 21 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4c7da8483398..4fb347f01653 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -67,8 +67,13 @@ list_for_each_entry(entry, &head, list) /* - * Is the SiS APIC rmw bug present ? + * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes + * When doing a read-modify-write operation on IOAPIC registers, older SiS APIC + * requires we rewrite the index register again where the read already set up + * the index register. + * The code to make use of sis_apic_bug has been removed, but we don't want to + * lose this knowledge. */ int sis_apic_bug = -1; @@ -293,22 +298,6 @@ static void io_apic_write(unsigned int apic, unsigned int reg, writel(value, &io_apic->data); } -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - * - * Older SiS APIC requires we rewrite the index register - */ -static void io_apic_modify(unsigned int apic, unsigned int reg, - unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -445,29 +434,23 @@ static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, add_pin_to_irq_node(data, node, newapic, newpin); } -static void __io_apic_modify_irq(struct irq_pin_list *entry, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) -{ - unsigned int reg, pin; - - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); -} - static void io_apic_modify_irq(struct mp_chip_data *data, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { + union entry_union eu; struct irq_pin_list *entry; - for_each_irq_pin(entry, data->irq_2_pin) - __io_apic_modify_irq(entry, mask_and, mask_or, final); + eu.entry = data->entry; + eu.w1 &= mask_and; + eu.w1 |= mask_or; + data->entry = eu.entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + if (final) + final(entry); + } } static void io_apic_sync(struct irq_pin_list *entry) @@ -1739,28 +1722,6 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) return was_pending; } -static void __target_IO_APIC_irq(unsigned int irq, struct irq_cfg *cfg, - struct mp_chip_data *data) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - unsigned int dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); - - for_each_irq_pin(entry, data->irq_2_pin) { - unsigned int reg; - - apic = entry->apic; - pin = entry->pin; - - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - } -} - atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -1926,6 +1887,7 @@ static int ioapic_set_affinity(struct irq_data *irq_data, { struct irq_data *parent = irq_data->parent_data; struct mp_chip_data *data = irq_data->chip_data; + struct irq_pin_list *entry; struct irq_cfg *cfg; unsigned long flags; int ret; @@ -1936,7 +1898,9 @@ static int ioapic_set_affinity(struct irq_data *irq_data, cfg = irqd_cfg(irq_data); data->entry.dest = cfg->dest_apicid; data->entry.vector = cfg->vector; - __target_IO_APIC_irq(irq_data->irq, cfg, irq_data->chip_data); + for_each_irq_pin(entry, data->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, + data->entry); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); -- cgit v1.2.1 From 1f934641294ca2e09016c689862378fbb15da4d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:29:58 +0800 Subject: x86/irq: Remove sis apic bug workaround The SiS apic bug workaround is now obsolete as we cache the register values for performance reasons. Signed-off-by: Thomas Gleixner Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-22-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 3 --- arch/x86/kernel/apic/io_apic.c | 35 ++++++++++------------------------- 2 files changed, 10 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 9b9050129ee7..d58f1c6b5608 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -120,9 +120,6 @@ extern int mp_irq_entries; /* MP IRQ source entries */ extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; -/* Older SiS APIC requires we rewrite the index register */ -extern int sis_apic_bug; - /* 1 if "noapic" boot option passed */ extern int skip_ioapic_setup; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4fb347f01653..9806f9605bc4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -18,6 +18,16 @@ * and Rolf G. Tews * for testing these extensively * Paul Diefenbaugh : Added full ACPI support + * + * Historical information which is worth to be preserved: + * + * - SiS APIC rmw bug: + * + * We used to have a workaround for a bug in SiS chips which + * required to rewrite the index register for a read-modify-write + * operation as the chip lost the index information which was + * setup for the read already. We cache the data now, so that + * workaround has been removed. */ #include @@ -66,17 +76,6 @@ #define for_each_irq_pin(entry, head) \ list_for_each_entry(entry, &head, list) -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - * When doing a read-modify-write operation on IOAPIC registers, older SiS APIC - * requires we rewrite the index register again where the read already set up - * the index register. - * The code to make use of sis_apic_bug has been removed, but we don't want to - * lose this knowledge. - */ -int sis_apic_bug = -1; - static DEFINE_RAW_SPINLOCK(ioapic_lock); static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; @@ -2320,20 +2319,6 @@ void __init setup_IO_APIC(void) ioapic_initialized = 1; } -/* - * Called after all the initialization is done. If we didn't find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - static void resume_ioapic_id(int ioapic_idx) { unsigned long flags; -- cgit v1.2.1 From a2cbbb47fd90ef1161ce22b099de5c6095f8365f Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:59 +0800 Subject: x86/irq: Remove unused alloc_irq_and_cfg_at() There's no caller of alloc_irq_and_cfg_at() anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-23-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 1 - arch/x86/kernel/apic/vector.c | 21 --------------------- 2 files changed, 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index f000b58cbc0c..bb2c990b97ef 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -186,7 +186,6 @@ extern void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src); extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); -extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index af224e6774d8..51cd46bfc46e 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -76,27 +76,6 @@ out_cfg: return NULL; } -struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) -{ - int res = irq_alloc_desc_at(at, node); - struct irq_cfg *cfg; - - if (res < 0) { - if (res != -EEXIST) - return NULL; - cfg = irq_cfg(at); - if (cfg) - return cfg; - } - - cfg = alloc_irq_cfg(node); - if (cfg) - irq_set_chip_data(at, cfg); - else - irq_free_desc(at); - return cfg; -} - static void free_irq_cfg(struct irq_cfg *cfg) { if (cfg) { -- cgit v1.2.1 From f970510cc55e41d21ca30feb56873aaeb57ec18d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:00 +0800 Subject: x86/irq: Make functions only used in vector.c static Function {assign|clear}_irq_vector() and apic_retrigger_irq() are only used in vector.c, so make them static. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-24-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 3 --- arch/x86/kernel/apic/vector.c | 7 ++++--- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index bb2c990b97ef..9a797682fef5 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -188,8 +188,6 @@ extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); -extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); -extern void clear_irq_vector(int irq, struct irq_cfg *cfg); extern void setup_vector_irq(int cpu); #ifdef CONFIG_SMP extern void send_cleanup_vector(struct irq_cfg *); @@ -199,7 +197,6 @@ static inline void send_cleanup_vector(struct irq_cfg *c) { } static inline void irq_complete_move(struct irq_cfg *c) { } #endif -extern int apic_retrigger_irq(struct irq_data *data); extern void apic_ack_edge(struct irq_data *data); extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, unsigned int *dest_id); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 51cd46bfc46e..d52af4d805db 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -185,7 +185,8 @@ next: return err; } -int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int assign_irq_vector(int irq, struct irq_cfg *cfg, + const struct cpumask *mask) { int err; unsigned long flags; @@ -196,7 +197,7 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) return err; } -void clear_irq_vector(int irq, struct irq_cfg *cfg) +static void clear_irq_vector(int irq, struct irq_cfg *cfg) { int cpu, vector; unsigned long flags; @@ -441,7 +442,7 @@ void setup_vector_irq(int cpu) __setup_vector_irq(cpu); } -int apic_retrigger_irq(struct irq_data *data) +static int apic_retrigger_irq(struct irq_data *data) { struct irq_cfg *cfg = irqd_cfg(data); unsigned long flags; -- cgit v1.2.1 From 68f9f4404d74f859dc84973db8731b41a51d929a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:01 +0800 Subject: x86/irq: Remove function apic_set_affinity() Now there's no user of apic_set_affinity(), so remove it. Also rename vector_set_affinity() to apic_set_affinity() for consistency. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-25-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 2 -- arch/x86/kernel/apic/vector.c | 40 +++------------------------------------- 2 files changed, 3 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9a797682fef5..727c62378a65 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -198,8 +198,6 @@ static inline void irq_complete_move(struct irq_cfg *c) { } #endif extern void apic_ack_edge(struct irq_data *data); -extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id); #else /* CONFIG_X86_LOCAL_APIC */ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index d52af4d805db..1aea62d60cf2 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -463,42 +463,8 @@ void apic_ack_edge(struct irq_data *data) ack_APIC_irq(); } -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. - */ -int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id) -{ - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int irq = data->irq; - int err; - - if (!config_enabled(CONFIG_SMP)) - return -EPERM; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - err = assign_irq_vector(irq, cfg, mask); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); - if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)) - pr_err("Failed to recover vector for irq %d\n", irq); - return err; - } - - cpumask_copy(data->affinity, mask); - - return 0; -} - -static int vector_set_affinity(struct irq_data *irq_data, - const struct cpumask *dest, bool force) +static int apic_set_affinity(struct irq_data *irq_data, + const struct cpumask *dest, bool force) { struct irq_cfg *cfg = irq_data->chip_data; int err, irq = irq_data->irq; @@ -523,7 +489,7 @@ static int vector_set_affinity(struct irq_data *irq_data, static struct irq_chip lapic_controller = { .irq_ack = apic_ack_edge, - .irq_set_affinity = vector_set_affinity, + .irq_set_affinity = apic_set_affinity, .irq_retrigger = apic_retrigger_irq, }; -- cgit v1.2.1 From c6c2002b744215810c770dd73f45da954bcfa9d5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:02 +0800 Subject: x86/irq: Move check of cfg->move_in_progress into send_cleanup_vector() Move check of cfg->move_in_progress into send_cleanup_vector() to prepare for simplifying struct irq_cfg. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428978610-28986-26-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 10 ++++++++-- arch/x86/platform/uv/uv_irq.c | 3 +-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 1aea62d60cf2..0092a6e0d5ee 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -494,7 +494,7 @@ static struct irq_chip lapic_controller = { }; #ifdef CONFIG_SMP -void send_cleanup_vector(struct irq_cfg *cfg) +static void __send_cleanup_vector(struct irq_cfg *cfg) { cpumask_var_t cleanup_mask; @@ -512,6 +512,12 @@ void send_cleanup_vector(struct irq_cfg *cfg) cfg->move_in_progress = 0; } +void send_cleanup_vector(struct irq_cfg *cfg) +{ + if (cfg->move_in_progress) + __send_cleanup_vector(cfg); +} + asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -582,7 +588,7 @@ static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) me = smp_processor_id(); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) - send_cleanup_vector(cfg); + __send_cleanup_vector(cfg); } void irq_complete_move(struct irq_cfg *cfg) diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 54af6e388a12..091b36ac44c4 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -63,8 +63,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0) { uv_program_mmr(cfg, data->chip_data); - if (cfg->move_in_progress) - send_cleanup_vector(cfg); + send_cleanup_vector(cfg); } return ret; -- cgit v1.2.1 From 7f3262edcdf623296b514377d52911b115c7ab49 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:03 +0800 Subject: x86/irq: Move private data in struct irq_cfg into dedicated data structure Several fields in struct irq_cfg are private to vector.c, so move it into dedicated data structure. This helps to hide implementation details. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-27-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1416901802-24211-35-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner Tested-by: Joerg Roedel --- arch/x86/include/asm/hw_irq.h | 3 - arch/x86/kernel/apic/vector.c | 221 +++++++++++++++++++++++------------------- 2 files changed, 119 insertions(+), 105 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 727c62378a65..3b8233a26348 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -171,11 +171,8 @@ enum { }; struct irq_cfg { - cpumask_var_t domain; - cpumask_var_t old_domain; unsigned int dest_apicid; u8 vector; - u8 move_in_progress : 1; }; extern struct irq_domain *x86_vector_domain; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 0092a6e0d5ee..60047495041c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -21,11 +21,18 @@ #include #include +struct apic_chip_data { + struct irq_cfg cfg; + cpumask_var_t domain; + cpumask_var_t old_domain; + u8 move_in_progress : 1; +}; + struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); static struct irq_chip lapic_controller; #ifdef CONFIG_X86_IO_APIC -static struct irq_cfg *legacy_irq_cfgs[NR_IRQS_LEGACY]; +static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; #endif void lock_vector_lock(void) @@ -41,12 +48,7 @@ void unlock_vector_lock(void) raw_spin_unlock(&vector_lock); } -struct irq_cfg *irq_cfg(unsigned int irq) -{ - return irqd_cfg(irq_get_irq_data(irq)); -} - -struct irq_cfg *irqd_cfg(struct irq_data *irq_data) +static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data) { if (!irq_data) return NULL; @@ -57,36 +59,48 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data) return irq_data->chip_data; } -static struct irq_cfg *alloc_irq_cfg(int node) +struct irq_cfg *irqd_cfg(struct irq_data *irq_data) +{ + struct apic_chip_data *data = apic_chip_data(irq_data); + + return data ? &data->cfg : NULL; +} + +struct irq_cfg *irq_cfg(unsigned int irq) { - struct irq_cfg *cfg; + return irqd_cfg(irq_get_irq_data(irq)); +} - cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); - if (!cfg) +static struct apic_chip_data *alloc_apic_chip_data(int node) +{ + struct apic_chip_data *data; + + data = kzalloc_node(sizeof(*data), GFP_KERNEL, node); + if (!data) return NULL; - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) - goto out_cfg; - if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) + if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node)) + goto out_data; + if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node)) goto out_domain; - return cfg; + return data; out_domain: - free_cpumask_var(cfg->domain); -out_cfg: - kfree(cfg); + free_cpumask_var(data->domain); +out_data: + kfree(data); return NULL; } -static void free_irq_cfg(struct irq_cfg *cfg) +static void free_apic_chip_data(struct apic_chip_data *data) { - if (cfg) { - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); + if (data) { + free_cpumask_var(data->domain); + free_cpumask_var(data->old_domain); + kfree(data); } } -static int -__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int __assign_irq_vector(int irq, struct apic_chip_data *d, + const struct cpumask *mask) { /* * NOTE! The local APIC isn't very good at handling @@ -104,7 +118,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int cpu, err; cpumask_var_t tmp_mask; - if (cfg->move_in_progress) + if (d->move_in_progress) return -EBUSY; if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) @@ -112,26 +126,26 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; - cpumask_clear(cfg->old_domain); + cpumask_clear(d->old_domain); cpu = cpumask_first_and(mask, cpu_online_mask); while (cpu < nr_cpu_ids) { int new_cpu, vector, offset; apic->vector_allocation_domain(cpu, tmp_mask, mask); - if (cpumask_subset(tmp_mask, cfg->domain)) { + if (cpumask_subset(tmp_mask, d->domain)) { err = 0; - if (cpumask_equal(tmp_mask, cfg->domain)) + if (cpumask_equal(tmp_mask, d->domain)) break; /* * New cpumask using the vector is a proper subset of * the current in use mask. So cleanup the vector * allocation for the members that are not used anymore. */ - cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); - cpumask_and(cfg->domain, cfg->domain, tmp_mask); + cpumask_andnot(d->old_domain, d->domain, tmp_mask); + d->move_in_progress = + cpumask_intersects(d->old_domain, cpu_online_mask); + cpumask_and(d->domain, d->domain, tmp_mask); break; } @@ -145,8 +159,8 @@ next: } if (unlikely(current_vector == vector)) { - cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); - cpumask_andnot(tmp_mask, mask, cfg->old_domain); + cpumask_or(d->old_domain, d->old_domain, tmp_mask); + cpumask_andnot(tmp_mask, mask, d->old_domain); cpu = cpumask_first_and(tmp_mask, cpu_online_mask); continue; } @@ -162,15 +176,15 @@ next: /* Found one! */ current_vector = vector; current_offset = offset; - if (cfg->vector) { - cpumask_copy(cfg->old_domain, cfg->domain); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); + if (d->cfg.vector) { + cpumask_copy(d->old_domain, d->domain); + d->move_in_progress = + cpumask_intersects(d->old_domain, cpu_online_mask); } for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cpumask_copy(cfg->domain, tmp_mask); + d->cfg.vector = vector; + cpumask_copy(d->domain, tmp_mask); err = 0; break; } @@ -178,46 +192,46 @@ next: if (!err) { /* cache destination APIC IDs into cfg->dest_apicid */ - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, - &cfg->dest_apicid); + err = apic->cpu_mask_to_apicid_and(mask, d->domain, + &d->cfg.dest_apicid); } return err; } -static int assign_irq_vector(int irq, struct irq_cfg *cfg, +static int assign_irq_vector(int irq, struct apic_chip_data *data, const struct cpumask *mask) { int err; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, cfg, mask); + err = __assign_irq_vector(irq, data, mask); raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } -static void clear_irq_vector(int irq, struct irq_cfg *cfg) +static void clear_irq_vector(int irq, struct apic_chip_data *data) { int cpu, vector; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - BUG_ON(!cfg->vector); + BUG_ON(!data->cfg.vector); - vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + vector = data->cfg.vector; + for_each_cpu_and(cpu, data->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; - cfg->vector = 0; - cpumask_clear(cfg->domain); + data->cfg.vector = 0; + cpumask_clear(data->domain); - if (likely(!cfg->move_in_progress)) { + if (likely(!data->move_in_progress)) { raw_spin_unlock_irqrestore(&vector_lock, flags); return; } - for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -226,7 +240,7 @@ static void clear_irq_vector(int irq, struct irq_cfg *cfg) break; } } - cfg->move_in_progress = 0; + data->move_in_progress = 0; raw_spin_unlock_irqrestore(&vector_lock, flags); } @@ -261,10 +275,10 @@ static void x86_vector_free_irqs(struct irq_domain *domain, irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); if (irq_data && irq_data->chip_data) { clear_irq_vector(virq + i, irq_data->chip_data); - free_irq_cfg(irq_data->chip_data); + free_apic_chip_data(irq_data->chip_data); #ifdef CONFIG_X86_IO_APIC if (virq + i < nr_legacy_irqs()) - legacy_irq_cfgs[virq + i] = NULL; + legacy_irq_data[virq + i] = NULL; #endif irq_domain_reset_irq_data(irq_data); } @@ -275,9 +289,9 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { struct irq_alloc_info *info = arg; + struct apic_chip_data *data; const struct cpumask *mask; struct irq_data *irq_data; - struct irq_cfg *cfg; int i, err; if (disable_apic) @@ -292,20 +306,20 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irq_data); #ifdef CONFIG_X86_IO_APIC - if (virq + i < nr_legacy_irqs() && legacy_irq_cfgs[virq + i]) - cfg = legacy_irq_cfgs[virq + i]; + if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i]) + data = legacy_irq_data[virq + i]; else #endif - cfg = alloc_irq_cfg(irq_data->node); - if (!cfg) { + data = alloc_apic_chip_data(irq_data->node); + if (!data) { err = -ENOMEM; goto error; } irq_data->chip = &lapic_controller; - irq_data->chip_data = cfg; + irq_data->chip_data = data; irq_data->hwirq = virq + i; - err = assign_irq_vector(virq, cfg, mask); + err = assign_irq_vector(virq, data, mask); if (err) goto error; } @@ -349,22 +363,22 @@ int __init arch_probe_nr_irqs(void) static void init_legacy_irqs(void) { int i, node = cpu_to_node(0); - struct irq_cfg *cfg; + struct apic_chip_data *data; /* * For legacy IRQ's, start with assigning irq0 to irq15 to * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. */ for (i = 0; i < nr_legacy_irqs(); i++) { - cfg = legacy_irq_cfgs[i] = alloc_irq_cfg(node); - BUG_ON(!cfg); + data = legacy_irq_data[i] = alloc_apic_chip_data(node); + BUG_ON(!data); /* * For legacy IRQ's, start with assigning irq0 to irq15 to * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. */ - cfg->vector = IRQ0_VECTOR + i; - cpumask_setall(cfg->domain); - irq_set_chip_data(i, cfg); + data->cfg.vector = IRQ0_VECTOR + i; + cpumask_setall(data->domain); + irq_set_chip_data(i, data); } } #else @@ -390,7 +404,7 @@ static void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ int irq, vector; - struct irq_cfg *cfg; + struct apic_chip_data *data; /* * vector_lock will make sure that we don't run into irq vector @@ -400,13 +414,13 @@ static void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { - cfg = irq_cfg(irq); - if (!cfg) + data = apic_chip_data(irq_get_irq_data(irq)); + if (!data) continue; - if (!cpumask_test_cpu(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, data->domain)) continue; - vector = cfg->vector; + vector = data->cfg.vector; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ @@ -415,8 +429,8 @@ static void __setup_vector_irq(int cpu) if (irq <= VECTOR_UNDEFINED) continue; - cfg = irq_cfg(irq); - if (!cpumask_test_cpu(cpu, cfg->domain)) + data = apic_chip_data(irq_get_irq_data(irq)); + if (!cpumask_test_cpu(cpu, data->domain)) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; } raw_spin_unlock(&vector_lock); @@ -442,15 +456,15 @@ void setup_vector_irq(int cpu) __setup_vector_irq(cpu); } -static int apic_retrigger_irq(struct irq_data *data) +static int apic_retrigger_irq(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); + struct apic_chip_data *data = apic_chip_data(irq_data); unsigned long flags; int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - cpu = cpumask_first_and(cfg->domain, cpu_online_mask); - apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); + cpu = cpumask_first_and(data->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -466,7 +480,7 @@ void apic_ack_edge(struct irq_data *data) static int apic_set_affinity(struct irq_data *irq_data, const struct cpumask *dest, bool force) { - struct irq_cfg *cfg = irq_data->chip_data; + struct apic_chip_data *data = irq_data->chip_data; int err, irq = irq_data->irq; if (!config_enabled(CONFIG_SMP)) @@ -475,11 +489,11 @@ static int apic_set_affinity(struct irq_data *irq_data, if (!cpumask_intersects(dest, cpu_online_mask)) return -EINVAL; - err = assign_irq_vector(irq, cfg, dest); + err = assign_irq_vector(irq, data, dest); if (err) { struct irq_data *top = irq_get_irq_data(irq); - if (assign_irq_vector(irq, cfg, top->affinity)) + if (assign_irq_vector(irq, data, top->affinity)) pr_err("Failed to recover vector for irq %d\n", irq); return err; } @@ -494,28 +508,31 @@ static struct irq_chip lapic_controller = { }; #ifdef CONFIG_SMP -static void __send_cleanup_vector(struct irq_cfg *cfg) +static void __send_cleanup_vector(struct apic_chip_data *data) { cpumask_var_t cleanup_mask; if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + for_each_cpu_and(i, data->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } - cfg->move_in_progress = 0; + data->move_in_progress = 0; } void send_cleanup_vector(struct irq_cfg *cfg) { - if (cfg->move_in_progress) - __send_cleanup_vector(cfg); + struct apic_chip_data *data; + + data = container_of(cfg, struct apic_chip_data, cfg); + if (data->move_in_progress) + __send_cleanup_vector(data); } asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) @@ -531,7 +548,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) int irq; unsigned int irr; struct irq_desc *desc; - struct irq_cfg *cfg; + struct apic_chip_data *data; irq = __this_cpu_read(vector_irq[vector]); @@ -542,8 +559,8 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) if (!desc) continue; - cfg = irq_cfg(irq); - if (!cfg) + data = apic_chip_data(&desc->irq_data); + if (!data) continue; raw_spin_lock(&desc->lock); @@ -552,10 +569,11 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) * Check if the irq migration is in progress. If so, we * haven't received the cleanup request yet for this irq. */ - if (cfg->move_in_progress) + if (data->move_in_progress) goto unlock; - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + if (vector == data->cfg.vector && + cpumask_test_cpu(me, data->domain)) goto unlock; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -581,14 +599,15 @@ unlock: static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { unsigned me; + struct apic_chip_data *data; - if (likely(!cfg->move_in_progress)) + data = container_of(cfg, struct apic_chip_data, cfg); + if (likely(!data->move_in_progress)) return; me = smp_processor_id(); - - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) - __send_cleanup_vector(cfg); + if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain)) + __send_cleanup_vector(data); } void irq_complete_move(struct irq_cfg *cfg) @@ -600,10 +619,8 @@ void irq_force_complete_move(int irq) { struct irq_cfg *cfg = irq_cfg(irq); - if (!cfg) - return; - - __irq_complete_move(cfg, cfg->vector); + if (cfg) + __irq_complete_move(cfg, cfg->vector); } #endif -- cgit v1.2.1 From 4399b14fa75c8d8225a0739fbcef575f02c6c6a5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:04 +0800 Subject: x86/irq: Refine the way to calculate NR_IRQS Now we have made MSI independent of IOAPIC, so we need to refine the way to calculate NR_IRQS to support configuration with MSI enabled but IOAPIC disabled. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Jan Beulich Link: http://lkml.kernel.org/r/1428978610-28986-28-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_vectors.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 666c89ec4bd7..b26cb124a4f1 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -155,18 +155,22 @@ static inline int invalid_vm86_irq(int irq) * static arrays. */ -#define NR_IRQS_LEGACY 16 +#define NR_IRQS_LEGACY 16 -#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) +#define CPU_VECTOR_LIMIT (64 * NR_CPUS) +#define IO_APIC_VECTOR_LIMIT (32 * MAX_IO_APICS) -#ifdef CONFIG_X86_IO_APIC -# define CPU_VECTOR_LIMIT (64 * NR_CPUS) -# define NR_IRQS \ +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI) +#define NR_IRQS \ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ (NR_VECTORS + CPU_VECTOR_LIMIT) : \ (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) -#else /* !CONFIG_X86_IO_APIC: */ -# define NR_IRQS NR_IRQS_LEGACY +#elif defined(CONFIG_X86_IO_APIC) +#define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) +#elif defined(CONFIG_PCI_MSI) +#define NR_IRQS (NR_VECTORS + CPU_VECTOR_LIMIT) +#else +#define NR_IRQS NR_IRQS_LEGACY #endif #endif /* _ASM_X86_IRQ_VECTORS_H */ -- cgit v1.2.1 From 46176f39b1a6f457eae78999befbdf58e68555e7 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:05 +0800 Subject: x86/irq, ACPI: Remove private function mp_register_gsi()/ mp_unregister_gsi() Function mp_register_gsi() is only called once, so fold it into caller acpi_register_gsi_ioapic(). Do the same for mp_unregister_gsi(). Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Link: http://lkml.kernel.org/r/1428978610-28986-29-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 57 ++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 21e460b3b360..91a10120ed10 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -400,42 +400,6 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } -static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, - int polarity) -{ - int irq, node; - struct irq_alloc_info info; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; - - trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; - polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; - node = dev ? dev_to_node(dev) : NUMA_NO_NODE; - ioapic_set_alloc_attr(&info, node, trigger, polarity); - irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); - if (irq < 0) - return irq; - - /* Don't set up the ACPI SCI because it's already set up */ - if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi) - mp_config_acpi_gsi(dev, gsi, trigger, polarity); - - return irq; -} - -static void mp_unregister_gsi(u32 gsi) -{ - int irq; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return; - - irq = mp_map_gsi_to_irq(gsi, 0, NULL); - if (irq > 0) - mp_unmap_irq(irq); -} - static struct irq_domain_ops acpi_irqdomain_ops = { .alloc = mp_irqdomain_alloc, .free = mp_irqdomain_free, @@ -662,10 +626,21 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int trigger, int polarity) { int irq = gsi; - #ifdef CONFIG_X86_IO_APIC + int node; + struct irq_alloc_info info; + + node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; + polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; + ioapic_set_alloc_attr(&info, node, trigger, polarity); + mutex_lock(&acpi_ioapic_lock); - irq = mp_register_gsi(dev, gsi, trigger, polarity); + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + /* Don't set up the ACPI SCI because it's already set up */ + if (irq >= 0 && enable_update_mptable && + acpi_gbl_FADT.sci_interrupt != gsi) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); mutex_unlock(&acpi_ioapic_lock); #endif @@ -675,8 +650,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, static void acpi_unregister_gsi_ioapic(u32 gsi) { #ifdef CONFIG_X86_IO_APIC + int irq; + mutex_lock(&acpi_ioapic_lock); - mp_unregister_gsi(gsi); + irq = mp_map_gsi_to_irq(gsi, 0, NULL); + if (irq > 0) + mp_unmap_irq(irq); mutex_unlock(&acpi_ioapic_lock); #endif } -- cgit v1.2.1 From 335efdf57da39d3949c3ef9338de5737e85cbe52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:30:06 +0800 Subject: x86, ioapic: Use proper defines for the entry fields While looking at the printout issue, I stumbled more than once over the various 0/1 assignments which are either commented in strange ways or force to lookup the meaning. Use proper constants and fix the misleading comments. While at it remove pointless 0 assignments in native_disable_io_apic() which have no value for understanding the code. Signed-off-by: Thomas Gleixner Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1428978610-28986-30-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 16 +++++-- arch/x86/kernel/apic/io_apic.c | 100 ++++++++++++++++++++--------------------- 2 files changed, 63 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index d58f1c6b5608..984afb732efe 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -98,9 +98,19 @@ struct IR_IO_APIC_route_entry { struct irq_alloc_info; struct irq_data; -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#define IOAPIC_MASKED 1 +#define IOAPIC_UNMASKED 0 + +#define IOAPIC_POL_HIGH 0 +#define IOAPIC_POL_LOW 1 + +#define IOAPIC_DEST_MODE_PHYSICAL 0 +#define IOAPIC_DEST_MODE_LOGICAL 1 + #define IOAPIC_MAP_ALLOC 0x1 #define IOAPIC_MAP_CHECK 0x2 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9806f9605bc4..a63167f96126 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -356,7 +356,7 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_mask_entry(int apic, int pin) { unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; + union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); @@ -517,7 +517,7 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. */ - entry1.mask = 1; + entry1.mask = IOAPIC_MASKED; entry1.trigger = IOAPIC_EDGE; __ioapic_write_entry(apic, pin, entry1); @@ -553,8 +553,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -567,7 +567,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. */ - if (!entry.trigger) { + if (entry.trigger == IOAPIC_EDGE) { entry.trigger = IOAPIC_LEVEL; ioapic_write_entry(apic, pin, entry); } @@ -670,8 +670,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); } } @@ -773,11 +773,11 @@ static int EISA_ELCR(unsigned int irq) #endif -/* ISA interrupts are always polarity zero edge triggered, +/* ISA interrupts are always active high edge triggered, * when listed as conforming in the MP table. */ -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) +#define default_ISA_trigger(idx) (IOAPIC_EDGE) +#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) /* EISA interrupts are always polarity zero and can be edge or level * trigger depending on the ELCR value. If an interrupt is listed as @@ -787,11 +787,11 @@ static int EISA_ELCR(unsigned int irq) #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) #define default_EISA_polarity(idx) default_ISA_polarity(idx) -/* PCI interrupts are always polarity one level triggered, +/* PCI interrupts are always active low level triggered, * when listed as conforming in the MP table. */ -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) +#define default_PCI_trigger(idx) (IOAPIC_LEVEL) +#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) static int irq_polarity(int idx) { @@ -811,24 +811,24 @@ static int irq_polarity(int idx) break; case 1: /* high active */ { - polarity = 0; + polarity = IOAPIC_POL_HIGH; break; } case 2: /* reserved */ { pr_warn("broken BIOS!!\n"); - polarity = 1; + polarity = IOAPIC_POL_LOW; break; } case 3: /* low active */ { - polarity = 1; + polarity = IOAPIC_POL_LOW; break; } default: /* invalid */ { pr_warn("broken BIOS!!\n"); - polarity = 1; + polarity = IOAPIC_POL_LOW; break; } } @@ -870,7 +870,7 @@ static int irq_trigger(int idx) default: { pr_warn("broken BIOS!!\n"); - trigger = 1; + trigger = IOAPIC_LEVEL; break; } } @@ -878,24 +878,24 @@ static int irq_trigger(int idx) break; case 1: /* edge */ { - trigger = 0; + trigger = IOAPIC_EDGE; break; } case 2: /* reserved */ { pr_warn("broken BIOS!!\n"); - trigger = 1; + trigger = IOAPIC_LEVEL; break; } case 3: /* level */ { - trigger = 1; + trigger = IOAPIC_LEVEL; break; } default: /* invalid */ { pr_warn("broken BIOS!!\n"); - trigger = 0; + trigger = IOAPIC_EDGE; break; } } @@ -939,11 +939,11 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, dst->ioapic_polarity = polarity; } else { /* - * PCI interrupts are always polarity one level + * PCI interrupts are always active low level * triggered. */ - dst->ioapic_trigger = 1; - dst->ioapic_polarity = 1; + dst->ioapic_trigger = IOAPIC_LEVEL; + dst->ioapic_polarity = IOAPIC_POL_LOW; } } } @@ -1296,9 +1296,10 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) entry = ioapic_read_entry(apic, i); snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", - i, entry.mask ? "disabled" : "enabled ", - entry.trigger ? "level" : "edge ", - entry.polarity ? "low " : "high", + i, + entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", + entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", + entry.polarity == IOAPIC_POL_LOW ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); if (ir_entry->format) printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", @@ -1306,7 +1307,9 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) ir_entry->zero); else printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", - buf, entry.dest_mode ? "logical " : "physical", + buf, + entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? + "logical " : "physical", entry.dest, entry.delivery_mode); } } @@ -1476,15 +1479,12 @@ void native_disable_io_apic(void) struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = read_apic_id(); + entry.mask = IOAPIC_UNMASKED; + entry.trigger = IOAPIC_EDGE; + entry.polarity = IOAPIC_POL_HIGH; + entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry.delivery_mode = dest_ExtINT; + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1494,7 +1494,6 @@ void native_disable_io_apic(void) if (cpu_has_apic || apic_from_smp_config()) disconnect_bsp_APIC(ioapic_i8259.pin != -1); - } /* @@ -2018,12 +2017,12 @@ static inline void __init unlock_ExtINT_logic(void) memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ + entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry1.mask = IOAPIC_UNMASKED; entry1.dest = hard_smp_processor_id(); entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; - entry1.trigger = 0; + entry1.trigger = IOAPIC_EDGE; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2911,9 +2910,9 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, data->polarity = info->ioapic_polarity; } else if (acpi_get_override_irq(gsi, &data->trigger, &data->polarity) < 0) { - /* PCI interrupts are always polarity one level triggered. */ - data->trigger = 1; - data->polarity = 1; + /* PCI interrupts are always active low level triggered. */ + data->trigger = IOAPIC_LEVEL; + data->polarity = IOAPIC_POL_LOW; } } @@ -2925,15 +2924,16 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, entry->dest_mode = apic->irq_dest_mode; entry->dest = cfg->dest_apicid; entry->vector = cfg->vector; - entry->mask = 0; /* enable IRQ */ entry->trigger = data->trigger; entry->polarity = data->polarity; /* - * Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + * Mask level triggered irqs. Edge triggered irqs are masked + * by the irq core code in case they fire. */ - if (data->trigger) - entry->mask = 1; + if (data->trigger == IOAPIC_LEVEL) + entry->mask = IOAPIC_MASKED; + else + entry->mask = IOAPIC_UNMASKED; } int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, -- cgit v1.2.1 From ab76085ec0858d4c2707ea0d036db00ef4aee8fd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:30:07 +0800 Subject: x86,ioapic: Cleanup irq_trigger/polarity() These functions are full of pointless indentations, useless comments and even more useless printks. Clean them up. Signed-off-by: Thomas Gleixner Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-31-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: x86@kernel.org Signed-off-by: Jiang Liu --- arch/x86/kernel/apic/io_apic.c | 138 +++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 88 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a63167f96126..9fcca68b183d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -796,45 +796,47 @@ static int EISA_ELCR(unsigned int irq) static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; - int polarity; /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = IOAPIC_POL_HIGH; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - polarity = IOAPIC_POL_LOW; - break; - } - case 3: /* low active */ - { - polarity = IOAPIC_POL_LOW; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - polarity = IOAPIC_POL_LOW; - break; - } + switch (mp_irqs[idx].irqflag & 0x03) { + case 0: + /* conforms to spec, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + return default_ISA_polarity(idx); + else + return default_PCI_polarity(idx); + case 1: + return IOAPIC_POL_HIGH; + case 2: + pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_POL_LOW; } - return polarity; } +#ifdef CONFIG_EISA +static int eisa_irq_trigger(int idx, int bus, int trigger) +{ + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_PCI: + case MP_BUS_ISA: + return trigger; + case MP_BUS_EISA: + return default_EISA_trigger(idx); + } + pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); + return IOAPIC_LEVEL; +} +#else +static inline int eisa_irq_trigger(int idx, int bus, int trigger) +{ + return trigger; +} +#endif + static int irq_trigger(int idx) { int bus = mp_irqs[idx].srcbus; @@ -843,63 +845,23 @@ static int irq_trigger(int idx) /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); -#ifdef CONFIG_EISA - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - default: - { - pr_warn("broken BIOS!!\n"); - trigger = IOAPIC_LEVEL; - break; - } - } -#endif - break; - case 1: /* edge */ - { - trigger = IOAPIC_EDGE; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - trigger = IOAPIC_LEVEL; - break; - } - case 3: /* level */ - { - trigger = IOAPIC_LEVEL; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - trigger = IOAPIC_EDGE; - break; - } + switch ((mp_irqs[idx].irqflag >> 2) & 0x03) { + case 0: + /* conforms to spec, ie. bus-type dependent trigger mode */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); + /* Take EISA into account */ + return eisa_irq_trigger(idx, bus, trigger); + case 1: + return IOAPIC_EDGE; + case 2: + pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_LEVEL; } - return trigger; } void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, -- cgit v1.2.1 From f7a0c78669ee79443a91ea89652766c1be8d9e04 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:30:08 +0800 Subject: x86: Cleanup irq_domain ops We have 3 identical copies of the ioapic domain ops for acpi, mpparse, and sfi. Have a global one in the io_apic code and be done with it. To avoid include hell in io_apic.h, create a private irqdomain header and include the generic irqdomain header from there. Signed-off-by: Thomas Gleixner Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: sfi-devel@simplefirmware.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Cc: Grant Likely Cc: Rob Herring Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1428978610-28986-32-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 29 ++--------------------------- arch/x86/include/asm/irqdomain.h | 34 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/acpi/boot.c | 13 +++---------- arch/x86/kernel/apic/io_apic.c | 9 ++++++++- arch/x86/kernel/devicetree.c | 12 ++++++------ arch/x86/kernel/mpparse.c | 9 +-------- arch/x86/platform/sfi/sfi.c | 10 ++-------- 7 files changed, 56 insertions(+), 60 deletions(-) create mode 100644 arch/x86/include/asm/irqdomain.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 984afb732efe..6cbf2cfb3f8a 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -96,7 +96,7 @@ struct IR_IO_APIC_route_entry { } __attribute__ ((packed)); struct irq_alloc_info; -struct irq_data; +struct ioapic_domain_cfg; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -163,23 +163,6 @@ extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); -enum ioapic_domain_type { - IOAPIC_DOMAIN_INVALID, - IOAPIC_DOMAIN_LEGACY, - IOAPIC_DOMAIN_STRICT, - IOAPIC_DOMAIN_DYNAMIC, -}; - -struct device_node; -struct irq_domain; -struct irq_domain_ops; - -struct ioapic_domain_cfg { - enum ioapic_domain_type type; - const struct irq_domain_ops *ops; - struct device_node *dev; -}; - extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, @@ -189,15 +172,7 @@ extern int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg); extern int mp_unregister_ioapic(u32 gsi_base); extern int mp_ioapic_registered(u32 gsi_base); -extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg); -extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs); -extern void mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data); -extern void mp_irqdomain_deactivate(struct irq_domain *domain, - struct irq_data *irq_data); -extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); + extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity); diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h new file mode 100644 index 000000000000..fe0d4c6636ec --- /dev/null +++ b/arch/x86/include/asm/irqdomain.h @@ -0,0 +1,34 @@ +#ifndef _ASM_IRQDOMAIN_H +#define _ASM_IRQDOMAIN_H + +#include + +enum ioapic_domain_type { + IOAPIC_DOMAIN_INVALID, + IOAPIC_DOMAIN_LEGACY, + IOAPIC_DOMAIN_STRICT, + IOAPIC_DOMAIN_DYNAMIC, +}; + +struct device_node; +struct irq_data; + +struct ioapic_domain_cfg { + enum ioapic_domain_type type; + const struct irq_domain_ops *ops; + struct device_node *dev; +}; + +extern const struct irq_domain_ops mp_ioapic_irqdomain_ops; + +extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg); +extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs); +extern void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data); +extern void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data); +extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); + +#endif diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 91a10120ed10..cb9f6f12246b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -31,12 +31,12 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -400,20 +400,13 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } -static struct irq_domain_ops acpi_irqdomain_ops = { - .alloc = mp_irqdomain_alloc, - .free = mp_irqdomain_free, - .activate = mp_irqdomain_activate, - .deactivate = mp_irqdomain_deactivate, -}; - static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = NULL; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; ioapic = (struct acpi_madt_io_apic *)header; @@ -764,7 +757,7 @@ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) u64 addr; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9fcca68b183d..845dc0df2002 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -41,13 +41,13 @@ #include #include #include -#include #include #include #include /* time_after() */ #include #include +#include #include #include #include @@ -2995,3 +2995,10 @@ int mp_irqdomain_ioapic_idx(struct irq_domain *domain) { return (int)(long)domain->host_data; } + +const struct irq_domain_ops mp_ioapic_irqdomain_ops = { + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, +}; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 05103d398ed7..5ee771859b6f 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -17,6 +16,7 @@ #include #include +#include #include #include #include @@ -216,11 +216,11 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } -const struct irq_domain_ops ioapic_irq_domain_ops = { - .alloc = dt_irqdomain_alloc, - .free = mp_irqdomain_free, - .activate = mp_irqdomain_activate, - .deactivate = mp_irqdomain_deactivate, +static const struct irq_domain_ops ioapic_irq_domain_ops = { + .alloc = dt_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init dtb_add_ioapic(struct device_node *dn) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index aa4feee74dbe..30ca7607cbbb 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,8 +19,8 @@ #include #include #include -#include +#include #include #include #include @@ -113,13 +113,6 @@ static void __init MP_bus_info(struct mpc_bus *m) pr_warn("Unknown bustype %s - ignoring\n", str); } -static struct irq_domain_ops mp_ioapic_irqdomain_ops = { - .alloc = mp_irqdomain_alloc, - .free = mp_irqdomain_free, - .activate = mp_irqdomain_activate, - .deactivate = mp_irqdomain_deactivate, -}; - static void __init MP_ioapic_info(struct mpc_ioapic *m) { struct ioapic_domain_cfg cfg = { diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index b66b194f9900..6c7111bbd1e9 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -25,8 +25,8 @@ #include #include #include -#include +#include #include #include #include @@ -71,12 +71,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC -static struct irq_domain_ops sfi_ioapic_irqdomain_ops = { - .alloc = mp_irqdomain_alloc, - .free = mp_irqdomain_free, - .activate = mp_irqdomain_activate, - .deactivate = mp_irqdomain_deactivate, -}; static int __init sfi_parse_ioapic(struct sfi_table_header *table) { @@ -85,7 +79,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table) int i, num; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_STRICT, - .ops = &sfi_ioapic_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; sb = (struct sfi_table_simple *)table; -- cgit v1.2.1 From d746d1ebd30c48562a3fb512ab18d5822f137820 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:09 +0800 Subject: x86/irq: Move irqdomain specific code into asm/irqdomain.h Now we have dedicated asm/irqdomain.h, so move irqdomain specific code into it. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1428978610-28986-33-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 24 ------------------------ arch/x86/include/asm/irq_remapping.h | 2 +- arch/x86/include/asm/irqdomain.h | 35 ++++++++++++++++++++++++++++++++--- arch/x86/kernel/apic/htirq.c | 2 +- arch/x86/kernel/apic/msi.c | 2 +- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/hpet.c | 2 +- arch/x86/platform/uv/uv_irq.c | 2 +- 8 files changed, 38 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 3b8233a26348..1f88e719fa78 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -94,8 +94,6 @@ extern void trace_call_function_single_interrupt(void); #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi #endif /* CONFIG_TRACING */ -struct irq_domain; - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; @@ -165,22 +163,11 @@ struct irq_alloc_info { }; }; -enum { - /* Allocate contiguous CPU vectors */ - X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, -}; - struct irq_cfg { unsigned int dest_apicid; u8 vector; }; -extern struct irq_domain *x86_vector_domain; - -extern void init_irq_alloc_info(struct irq_alloc_info *info, - const struct cpumask *mask); -extern void copy_irq_alloc_info(struct irq_alloc_info *dst, - struct irq_alloc_info *src); extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); extern void lock_vector_lock(void); @@ -200,17 +187,6 @@ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} #endif /* CONFIG_X86_LOCAL_APIC */ -#ifdef CONFIG_PCI_MSI -extern void arch_init_msi_domain(struct irq_domain *domain); -#else -static inline void arch_init_msi_domain(struct irq_domain *domain) { } -#endif -#ifdef CONFIG_HT_IRQ -extern void arch_init_htirq_domain(struct irq_domain *domain); -#else -static inline void arch_init_htirq_domain(struct irq_domain *domain) { } -#endif - /* Statistics */ extern atomic_t irq_err_count; extern atomic_t irq_mis_count; diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 09efa358c831..78974fbc33b4 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -22,7 +22,7 @@ #ifndef __X86_IRQ_REMAPPING_H #define __X86_IRQ_REMAPPING_H -#include +#include #include #include diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index fe0d4c6636ec..d26075b52885 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -2,6 +2,25 @@ #define _ASM_IRQDOMAIN_H #include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +enum { + /* Allocate contiguous CPU vectors */ + X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, +}; + +extern struct irq_domain *x86_vector_domain; + +extern void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask); +extern void copy_irq_alloc_info(struct irq_alloc_info *dst, + struct irq_alloc_info *src); +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +struct device_node; +struct irq_data; enum ioapic_domain_type { IOAPIC_DOMAIN_INVALID, @@ -10,9 +29,6 @@ enum ioapic_domain_type { IOAPIC_DOMAIN_DYNAMIC, }; -struct device_node; -struct irq_data; - struct ioapic_domain_cfg { enum ioapic_domain_type type; const struct irq_domain_ops *ops; @@ -30,5 +46,18 @@ extern void mp_irqdomain_activate(struct irq_domain *domain, extern void mp_irqdomain_deactivate(struct irq_domain *domain, struct irq_data *irq_data); extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); +#endif /* CONFIG_X86_IO_APIC */ + +#ifdef CONFIG_PCI_MSI +extern void arch_init_msi_domain(struct irq_domain *domain); +#else +static inline void arch_init_msi_domain(struct irq_domain *domain) { } +#endif + +#ifdef CONFIG_HT_IRQ +extern void arch_init_htirq_domain(struct irq_domain *domain); +#else +static inline void arch_init_htirq_domain(struct irq_domain *domain) { } +#endif #endif diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 1cae104415ea..341e99be42b8 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 109584261c4e..58fde664e7c0 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 60047495041c..ad786f8a7cc7 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index e3bc18080052..e2449cf38b06 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -11,8 +11,8 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 091b36ac44c4..cdf86cd3fd97 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -12,8 +12,8 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.1 From f7fa7aeeecb7a9abdd5f5d069a71ffb3e99a2a07 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:10 +0800 Subject: x86/irq: Avoid memory allocation in __assign_irq_vector() Function __assign_irq_vector() is protected by vector_lock, so use a global temporary cpu_mask to avoid allocating/freeing cpu_mask. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-34-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index ad786f8a7cc7..1c7dd42b98c1 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -30,6 +30,7 @@ struct apic_chip_data { struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); +static cpumask_var_t vector_cpumask; static struct irq_chip lapic_controller; #ifdef CONFIG_X86_IO_APIC static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; @@ -116,14 +117,10 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int current_offset = VECTOR_OFFSET_START % 16; int cpu, err; - cpumask_var_t tmp_mask; if (d->move_in_progress) return -EBUSY; - if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) - return -ENOMEM; - /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; cpumask_clear(d->old_domain); @@ -131,21 +128,22 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, while (cpu < nr_cpu_ids) { int new_cpu, vector, offset; - apic->vector_allocation_domain(cpu, tmp_mask, mask); + apic->vector_allocation_domain(cpu, vector_cpumask, mask); - if (cpumask_subset(tmp_mask, d->domain)) { + if (cpumask_subset(vector_cpumask, d->domain)) { err = 0; - if (cpumask_equal(tmp_mask, d->domain)) + if (cpumask_equal(vector_cpumask, d->domain)) break; /* * New cpumask using the vector is a proper subset of * the current in use mask. So cleanup the vector * allocation for the members that are not used anymore. */ - cpumask_andnot(d->old_domain, d->domain, tmp_mask); + cpumask_andnot(d->old_domain, d->domain, + vector_cpumask); d->move_in_progress = cpumask_intersects(d->old_domain, cpu_online_mask); - cpumask_and(d->domain, d->domain, tmp_mask); + cpumask_and(d->domain, d->domain, vector_cpumask); break; } @@ -159,16 +157,18 @@ next: } if (unlikely(current_vector == vector)) { - cpumask_or(d->old_domain, d->old_domain, tmp_mask); - cpumask_andnot(tmp_mask, mask, d->old_domain); - cpu = cpumask_first_and(tmp_mask, cpu_online_mask); + cpumask_or(d->old_domain, d->old_domain, + vector_cpumask); + cpumask_andnot(vector_cpumask, mask, d->old_domain); + cpu = cpumask_first_and(vector_cpumask, + cpu_online_mask); continue; } if (test_bit(vector, used_vectors)) goto next; - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { + for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) { if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED) goto next; @@ -181,14 +181,13 @@ next: d->move_in_progress = cpumask_intersects(d->old_domain, cpu_online_mask); } - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; d->cfg.vector = vector; - cpumask_copy(d->domain, tmp_mask); + cpumask_copy(d->domain, vector_cpumask); err = 0; break; } - free_cpumask_var(tmp_mask); if (!err) { /* cache destination APIC IDs into cfg->dest_apicid */ @@ -397,6 +396,8 @@ int __init arch_early_irq_init(void) arch_init_msi_domain(x86_vector_domain); arch_init_htirq_domain(x86_vector_domain); + BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL)); + return arch_early_ioapic_init(); } -- cgit v1.2.1 From 21a6dd5b391e419efb227251745b08d5838f09db Mon Sep 17 00:00:00 2001 From: firo yang Date: Thu, 23 Apr 2015 18:12:10 +0800 Subject: crypto: sha1-mb - Remove pointless cast Since kzalloc() returns a void pointer, we don't need to cast the return value in arch/x86/crypto/sha-mb/sha1_mb.c::sha1_mb_mod_init(). Signed-off-by: Firo Yang Signed-off-by: Herbert Xu --- arch/x86/crypto/sha-mb/sha1_mb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index e510b1c5d690..0cb5149518ce 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -885,7 +885,8 @@ static int __init sha1_mb_mod_init(void) INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher); cpu_state->cpu = cpu; cpu_state->alg_state = &sha1_mb_alg_state; - cpu_state->mgr = (struct sha1_ctx_mgr *) kzalloc(sizeof(struct sha1_ctx_mgr), GFP_KERNEL); + cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr), + GFP_KERNEL); if (!cpu_state->mgr) goto err2; sha1_ctx_mgr_init(cpu_state->mgr); -- cgit v1.2.1 From 4545c89880138b30a868159bc1b209867b8a5f32 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 27 Apr 2015 13:17:19 +0200 Subject: x86: introduce kaslr_offset() Offset that has been chosen for kaslr during kernel decompression can be easily computed as a difference between _text and __START_KERNEL. We are already making use of this in dump_kernel_offset() notifier and in arch_crash_save_vmcoreinfo(). Introduce kaslr_offset() that makes this computation instead of hard-coding it, so that other kernel code (such as live patching) can make use of it. Also convert existing users to make use of it. This patch is equivalent transofrmation without any effects on the resulting code: $ diff -u vmlinux.old.asm vmlinux.new.asm --- vmlinux.old.asm 2015-04-28 17:55:19.520983368 +0200 +++ vmlinux.new.asm 2015-04-28 17:55:24.141206072 +0200 @@ -1,5 +1,5 @@ -vmlinux.old: file format elf64-x86-64 +vmlinux.new: file format elf64-x86-64 Disassembly of section .text: $ Acked-by: Borislav Petkov Signed-off-by: Jiri Kosina --- arch/x86/include/asm/setup.h | 6 ++++++ arch/x86/kernel/machine_kexec_64.c | 3 ++- arch/x86/kernel/setup.c | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f69e06b283fb..785ac2f27271 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -65,12 +65,18 @@ static inline void x86_ce4100_early_setup(void) { } * This is set up by the setup-routine at boot-time */ extern struct boot_params boot_params; +extern char _text[]; static inline bool kaslr_enabled(void) { return !!(boot_params.hdr.loadflags & KASLR_FLAG); } +static inline unsigned long kaslr_offset(void) +{ + return (unsigned long)&_text - __START_KERNEL; +} + /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 415480d3ea84..e1029633f664 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef CONFIG_KEXEC_FILE static struct kexec_file_ops *kexec_file_loaders[] = { @@ -334,7 +335,7 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", - (unsigned long)&_text - __START_KERNEL); + kaslr_offset()); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d74ac33290ae..5056d3cfe266 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -834,7 +834,7 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) { if (kaslr_enabled()) { pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", - (unsigned long)&_text - __START_KERNEL, + kaslr_offset(), __START_KERNEL, __START_KERNEL_map, MODULES_VADDR-1); -- cgit v1.2.1 From 5d4351ba654c2f25eb4f6883db742a16bccbb36b Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 27 Apr 2015 13:25:23 +0200 Subject: livepatch: x86: make kASLR logic more accurate We give up old_addr hint from the coming patch module in cases when kernel load base has been randomized (as in such case, the coming module has no idea about the exact randomization offset). We are currently too pessimistic, and give up immediately as soon as CONFIG_RANDOMIZE_BASE is set; this doesn't however directly imply that the load base has actually been randomized. There are config options that disable kASLR (such as hibernation), user could have disabled kaslr on kernel command-line, etc. The loader propagates the information whether kernel has been randomized through bootparams. This allows us to have the condition more accurate. On top of that, it seems unnecessary to give up old_addr hints even if randomization is active. The relocation offset can be computed using kaslr_ofsset(), and therefore old_addr can be adjusted accordingly. Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- arch/x86/include/asm/livepatch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 2d29197bd2fb..19c099afa861 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -21,6 +21,7 @@ #ifndef _ASM_X86_LIVEPATCH_H #define _ASM_X86_LIVEPATCH_H +#include #include #include -- cgit v1.2.1 From 535b3ddc285825c058cef3436a9aa207edffa6cd Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 29 Apr 2015 18:09:53 +0200 Subject: x86: kaslr: fix build due to missing ALIGN definition Fengguang's bot reported that 4545c898 ("x86: introduce kaslr_offset()") broke randconfig build In file included from arch/x86/xen/vga.c:5:0: arch/x86/include/asm/setup.h: In function 'kaslr_offset': >> arch/x86/include/asm/setup.h:77:2: error: implicit declaration of function 'ALIGN' [-Werror=implicit-function-declaration] return (unsigned long)&_text - __START_KERNEL; ^ Fix that by making setup.h self-sufficient by explicitly including linux/kernel.h, which is needed for ALIGN() (which is what __START_KERNEL contains in its expansion). Reported-by: fengguang.wu@intel.com Signed-off-by: Jiri Kosina --- arch/x86/include/asm/setup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 785ac2f27271..11af24e09c8a 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -60,6 +60,7 @@ static inline void x86_ce4100_early_setup(void) { } #ifndef _SETUP #include +#include /* * This is set up by the setup-routine at boot-time -- cgit v1.2.1 From 0bb549052d33f8992544764a6cf1299d06ba7e2f Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Tue, 28 Apr 2015 18:44:31 -0400 Subject: efi: Add esrt support Add sysfs files for the EFI System Resource Table (ESRT) under /sys/firmware/efi/esrt and for each EFI System Resource Entry under entries/ as a subdir. The EFI System Resource Table (ESRT) provides a read-only catalog of system components for which the system accepts firmware upgrades via UEFI's "Capsule Update" feature. This module allows userland utilities to evaluate what firmware updates can be applied to this system, and potentially arrange for those updates to occur. The ESRT is described as part of the UEFI specification, in version 2.5 which should be available from http://uefi.org/specifications in early 2015. If you're a member of the UEFI Forum, information about its addition to the standard is available as UEFI Mantis 1090. For some hardware platforms, additional restrictions may be found at http://msdn.microsoft.com/en-us/library/windows/hardware/jj128256.aspx , and additional documentation may be found at http://download.microsoft.com/download/5/F/5/5F5D16CD-2530-4289-8019-94C6A20BED3C/windows-uefi-firmware-update-platform.docx . Signed-off-by: Peter Jones Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 02744df576d5..3b984c3aa1b0 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -501,6 +501,8 @@ void __init efi_init(void) if (efi_enabled(EFI_DBG)) print_efi_memmap(); + + efi_esrt_init(); } void __init efi_late_init(void) -- cgit v1.2.1 From eb18cf55c299d2ac5c8b5421c58b6c582a044475 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 May 2015 11:10:11 +0200 Subject: x86: Constify irqdomain ops Nothing changes those ops. Make the initializers readable while at it. Reported-by: Krzysztof Kozlowski Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/htirq.c | 10 +++++----- arch/x86/kernel/apic/vector.c | 6 +++--- arch/x86/platform/uv/uv_irq.c | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 341e99be42b8..ae50d3454d78 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -143,11 +143,11 @@ static void htirq_domain_deactivate(struct irq_domain *domain, write_ht_irq_msg(irq_data->irq, &msg); } -static struct irq_domain_ops htirq_domain_ops = { - .alloc = htirq_domain_alloc, - .free = htirq_domain_free, - .activate = htirq_domain_activate, - .deactivate = htirq_domain_deactivate, +static const struct irq_domain_ops htirq_domain_ops = { + .alloc = htirq_domain_alloc, + .free = htirq_domain_free, + .activate = htirq_domain_activate, + .deactivate = htirq_domain_deactivate, }; void arch_init_htirq_domain(struct irq_domain *parent) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 1c7dd42b98c1..426496862be0 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -330,9 +330,9 @@ error: return err; } -static struct irq_domain_ops x86_vector_domain_ops = { - .alloc = x86_vector_alloc_irqs, - .free = x86_vector_free_irqs, +static const struct irq_domain_ops x86_vector_domain_ops = { + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, }; int __init arch_probe_nr_irqs(void) diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index cdf86cd3fd97..8570abe68be1 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -149,11 +149,11 @@ static void uv_domain_deactivate(struct irq_domain *domain, uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); } -static struct irq_domain_ops uv_domain_ops = { - .alloc = uv_domain_alloc, - .free = uv_domain_free, - .activate = uv_domain_activate, - .deactivate = uv_domain_deactivate, +static const struct irq_domain_ops uv_domain_ops = { + .alloc = uv_domain_alloc, + .free = uv_domain_free, + .activate = uv_domain_activate, + .deactivate = uv_domain_deactivate, }; static struct irq_domain *uv_get_irq_domain(void) -- cgit v1.2.1 From 781674fc33adf0d975a361e111bb45804356aa23 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 4 May 2015 17:58:00 +0200 Subject: x86/x2apic: Acpi_gbl_FADT existence depends on CONFIG_ACPI If ACPI is disabled, acpi_gbl_FADT is not available, and and the build breaks. Signed-off-by: Jan Kiszka Link: http://lkml.kernel.org/r/55479708.2000104@siemens.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/x2apic_phys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6fae733e9194..3ffd925655e0 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -21,11 +21,13 @@ early_param("x2apic_phys", set_x2apic_phys_mode); static bool x2apic_fadt_phys(void) { +#ifdef CONFIG_ACPI if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "System requires x2apic physical mode\n"); return true; } +#endif return false; } -- cgit v1.2.1 From 19e3d60d49f05a9de0ef06c60703f31d4acd5f17 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 4 May 2015 17:58:01 +0200 Subject: x86: Let x2APIC support depend on interrupt remapping or guest support We are able to use x2APIC mode in the absence of interrupt remapping on certain hypervisors. So it is fine to disable IRQ_REMAP without having to give up x2APIC support. Signed-off-by: Jan Kiszka Link: http://lkml.kernel.org/r/55479709.4030901@siemens.com Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c86fdc13615f..3c17c04ce5a7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -340,7 +340,7 @@ config X86_FEATURE_NAMES config X86_X2APIC bool "Support x2apic" - depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP + depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST) ---help--- This enables x2apic support on CPUs that have this feature. -- cgit v1.2.1 From fd91784beb91239a697f855b6cda5035c1c5d6a7 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 12 Apr 2015 09:14:45 -0400 Subject: x86: Merge common 32-bit values in asm-offsets.c Merge common values for 32-bit native and compat. Signed-off-by: Brian Gerst Cc: Denys Vlasenko Acked-by: Andy Lutomirski Link: http://lkml.kernel.org/r/1428844486-6638-1-git-send-email-brgerst@gmail.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/asm-offsets.c | 19 +++++++++++++++++++ arch/x86/kernel/asm-offsets_32.c | 15 --------------- arch/x86/kernel/asm-offsets_64.c | 21 --------------------- 3 files changed, 19 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 9f6b9341950f..b27f6ec90caa 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -41,6 +41,25 @@ void common(void) { OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + BLANK(); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext_ia32, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext_ia32, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext_ia32, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext_ia32, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext_ia32, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext_ia32, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext_ia32, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip); + + BLANK(); + OFFSET(TI_sysenter_return, thread_info, sysenter_return); + + BLANK(); + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); +#endif + #ifdef CONFIG_PARAVIRT BLANK(); OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 47703aed74cf..628bfd4c06bb 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -17,17 +17,6 @@ void foo(void); void foo(void) { - OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax); - OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx); - OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx); - OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx); - OFFSET(IA32_SIGCONTEXT_si, sigcontext, si); - OFFSET(IA32_SIGCONTEXT_di, sigcontext, di); - OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp); - OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp); - OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip); - BLANK(); - OFFSET(CPUINFO_x86, cpuinfo_x86, x86); OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); @@ -37,7 +26,6 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_sysenter_return, thread_info, sysenter_return); OFFSET(TI_cpu, thread_info, cpu); BLANK(); @@ -60,9 +48,6 @@ void foo(void) OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); - BLANK(); - OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 5ce6f2da8763..dcaab87da629 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -29,27 +29,6 @@ int main(void) BLANK(); #endif -#ifdef CONFIG_IA32_EMULATION - OFFSET(TI_sysenter_return, thread_info, sysenter_return); - BLANK(); - -#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry) - ENTRY(ax); - ENTRY(bx); - ENTRY(cx); - ENTRY(dx); - ENTRY(si); - ENTRY(di); - ENTRY(bp); - ENTRY(sp); - ENTRY(ip); - BLANK(); -#undef ENTRY - - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); - BLANK(); -#endif - #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(cx); -- cgit v1.2.1 From c07e5a542e41b87583a8bc0e2849d14bbe919be0 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 12 Apr 2015 09:14:46 -0400 Subject: x86: Remove unused TI_cpu Signed-off-by: Brian Gerst Cc: Denys Vlasenko Acked-by: Andy Lutomirski Link: http://lkml.kernel.org/r/1428844486-6638-2-git-send-email-brgerst@gmail.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/asm-offsets_32.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 628bfd4c06bb..6ce39025f467 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -26,9 +26,6 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_cpu, thread_info, cpu); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); -- cgit v1.2.1 From 0c7965ff22472f60d5cf07308cb59f3a118feb8a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 13 Apr 2015 13:54:32 +0200 Subject: x86: Deinline dma_alloc_attrs() Reduces kernel size by 68739 bytes on allyesconfig build: text data bss dec hex filename 82662736 22255384 20627456 125545576 77bac68 vmlinux0 82594029 22255352 20627456 125476837 77a9fe5 vmlinux1 Signed-off-by: Denys Vlasenko Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Andi Kleen Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1428926075-28796-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/dma-mapping.h | 28 ++-------------------------- arch/x86/kernel/pci-dma.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 808dae63eeea..63393202b9ec 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -127,33 +127,9 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp) #define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) -static inline void * +void * dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - void *memory; - - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - - if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) - return memory; - - if (!dev) - dev = &x86_dma_fallback_dev; - - if (!is_device_dma_capable(dev)) - return NULL; - - if (!ops->alloc) - return NULL; - - memory = ops->alloc(dev, size, dma_handle, - dma_alloc_coherent_gfp_flags(dev, gfp), attrs); - debug_dma_alloc_coherent(dev, size, *dma_handle, memory); - - return memory; -} + gfp_t gfp, struct dma_attrs *attrs); #define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a25e202bb319..f9f7656e1d41 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -140,6 +140,34 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + void *memory; + + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) + return memory; + + if (!dev) + dev = &x86_dma_fallback_dev; + + if (!is_device_dma_capable(dev)) + return NULL; + + if (!ops->alloc) + return NULL; + + memory = ops->alloc(dev, size, dma_handle, + dma_alloc_coherent_gfp_flags(dev, gfp), attrs); + debug_dma_alloc_coherent(dev, size, *dma_handle, memory); + + return memory; +} +EXPORT_SYMBOL(dma_alloc_attrs); + /* * See for the iommu kernel * parameter documentation. -- cgit v1.2.1 From f1dc154f82595386cddcc7b980d8760474c3dd2d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 13 Apr 2015 13:54:33 +0200 Subject: x86: Deinline dma_free_attrs() Reduces kernel size by 76720 bytes on allyesconfig build: text data bss dec hex filename 82594029 22255352 20627456 125476837 77a9fe5 vmlinux1 82517277 22255384 20627456 125400117 7797435 vmlinux2 Signed-off-by: Denys Vlasenko Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Andi Kleen Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1428926075-28796-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/dma-mapping.h | 18 +++--------------- arch/x86/kernel/pci-dma.c | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 63393202b9ec..1f5b7287d1ad 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -133,20 +133,8 @@ dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, #define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) -static inline void dma_free_attrs(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - WARN_ON(irqs_disabled()); /* for portability */ - - if (dma_release_from_coherent(dev, get_order(size), vaddr)) - return; - - debug_dma_free_coherent(dev, size, vaddr, bus); - if (ops->free) - ops->free(dev, size, vaddr, bus, attrs); -} +void dma_free_attrs(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus, + struct dma_attrs *attrs); #endif diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f9f7656e1d41..353972c1946c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -168,6 +168,23 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dma_alloc_attrs); +void dma_free_attrs(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + WARN_ON(irqs_disabled()); /* for portability */ + + if (dma_release_from_coherent(dev, get_order(size), vaddr)) + return; + + debug_dma_free_coherent(dev, size, vaddr, bus); + if (ops->free) + ops->free(dev, size, vaddr, bus, attrs); +} +EXPORT_SYMBOL(dma_free_attrs); + /* * See for the iommu kernel * parameter documentation. -- cgit v1.2.1 From 84be456f883c4685680fba8e5154b5f72e92957e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 May 2015 12:46:15 +0200 Subject: remove We don't have any arch specific scatterlist now that parisc switched over to the generic one. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- arch/x86/include/asm/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 4e370a5d8117..81da003df21b 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.1 From 1222e564cf4394af0b3c5e8a73330b20862c068b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 May 2015 06:23:59 +0200 Subject: x86/platform/uv: Make SGI UV dependent on CONFIG_PCI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent PCI changes stopped exporting PCI constants if !CONFIG_PCI, which made the UV build fail: arch/x86/kernel/apic/x2apic_uv_x.c:843:16: error: ‘PCI_VGA_STATE_CHANGE_BRIDGE’ undeclared (first use in this function) arch/x86/kernel/apic/x2apic_uv_x.c:1023:2: error: implicit declaration of function ‘pci_register_set_vga_state’ [-Werror=implicit-function-declaration] As it's unlikely that an UV bootup will get far without PCI enumeration, make the platform Kconfig switch (CONFIG_X86_UV) depend on CONFIG_PCI=y. Cc: Robin Holt Cc: Dimitri Sivanich Cc: Russ Anderson Cc: Mike Travis Cc: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 226d5696e1d1..066d9bdc5496 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -441,6 +441,7 @@ config X86_UV depends on X86_EXTENDED_PLATFORM depends on NUMA depends on X86_X2APIC + depends on PCI ---help--- This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. -- cgit v1.2.1 From f5d6a52f511157c7476590532a23b5664b1ed877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Mon, 4 May 2015 11:42:34 +0200 Subject: x86/smpboot: Skip delays during SMP initialization similar to Xen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the per-CPU delays during SMP initialization, which seems to be possible on newer architectures with an x2APIC. Xen does this since 2011. In fact, this commit is basically a combination of the following Xen commits. The first removes the delays, the second fixes an issue with the removal: commit 68fce206f6dba9981e8322269db49692c95ce250 Author: Tim Deegan Date: Tue Jul 19 14:13:01 2011 +0100 x86: Remove timeouts from INIT-SIPI-SIPI sequence when using x2apic. Some of the timeouts are pointless since they're waiting for the ICR to ack the IPI delivery and that doesn't happen on x2apic. The others should be benign (and are suggested in the SDM) but removing them makes AP bringup much more reliable on some test boxes. Signed-off-by: Tim Deegan commit f12ee533150761df5a7099c83f2a5fa6c07d1187 Author: Gang Wei Date: Thu Dec 29 10:07:54 2011 +0000 X86: Add a delay between INIT & SIPIs for tboot AP bring-up in X2APIC case Without this delay, Xen could not bring APs up while working with TXT/tboot, because tboot needs some time in APs to handle INIT before becoming ready for receiving SIPIs (this delay was removed as part of c/s 23724 by Tim Deegan). Signed-off-by: Gang Wei Acked-by: Keir Fraser Acked-by: Tim Deegan Committed-by: Tim Deegan Signed-off-by: Jan H. Schönherr Cc: Anthony Liguori Cc: Borislav Petkov Cc: Gang Wei Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Deegan Link: http://lkml.kernel.org/r/1430732554-7294-1-git-send-email-jschoenh@amazon.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 61 +++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 50e547eac8cd..63b46414c80c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -555,7 +555,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { - unsigned long send_status, accept_status = 0; + unsigned long send_status = 0, accept_status = 0; int maxlvt, num_starts, j; maxlvt = lapic_get_maxlvt(); @@ -580,22 +580,34 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + if (!cpu_has_x2apic) { + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mdelay(10); + mdelay(10); - pr_debug("Deasserting INIT\n"); + pr_debug("Deasserting INIT\n"); - /* Target chip */ - /* Send IPI */ - apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); + /* Target chip */ + /* Send IPI */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mb(); - atomic_set(&init_deasserted, 1); + mb(); + atomic_set(&init_deasserted, 1); + } else if (tboot_enabled()) { + /* + * With tboot AP is actually spinning in a mini-guest before + * receiving INIT. Upon receiving INIT ipi, AP need time to + * VMExit, update VMCS to tracking SIPIs and VMResume. + * + * While AP is in root mode handling the INIT the CPU will drop + * any SIPIs + */ + udelay(10); + } /* * Should we send STARTUP IPIs ? @@ -637,20 +649,23 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid); - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(300); + if (!cpu_has_x2apic) { + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(300); - pr_debug("Startup point 1\n"); + pr_debug("Startup point 1\n"); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + } - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); -- cgit v1.2.1 From d9ee948d82203811a545ba26b0172fce4970d1dc Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 17 Dec 2014 18:05:29 -0800 Subject: x86/asm: Use -mskip-rax-setup if supported GCC 5 added a compiler option, -mskip-rax-setup, for x86-64. It skips setting up the RAX register when SSE is disabled and there are no variable arguments passed in vector registers. (According to the x86_64 ABI, %al is used as a hidden register containing the number of vector registers used). Since the kernel doesn't pass vector registers to functions with variable arguments, this option can be used to optimize the x86-64 kernel. This GCC feature was suggested by Rasmus Villemoes . This is the corresponding kernel change using it. For kernel v3.17: text data bss dec filename 11455921 2204048 5853184 19513153 vmlinux #with -mskip-rax-setup 11480079 2204048 5853184 19537311 vmlinux For Kernel v4.0+ - custom config: text data bss dec filename 10231778 3479800 16617472 30329050 vmlinux-gcc5+-mskip-rax-setup 10268797 3547448 16621568 30437813 vmlinux Signed-off-by: H.J. Lu Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rasmus Villemoes Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5ba2d9ce82dc..40af1bac2b7d 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -84,6 +84,9 @@ else # Use -mpreferred-stack-boundary=3 if supported. KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) + # Use -mskip-rax-setup if supported. + KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) -- cgit v1.2.1 From 1b4574292e9d2d37b3bb437c9e778fd2bba8e170 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Tue, 7 Apr 2015 16:46:37 -0500 Subject: x86/gart: Check for GART support before accessing GART registers GART registers are not present in newer AMD processors (Fam15h, Model 10h and later). So, avoid accessing those in PCI config space by returning early in early_gart_iommu_check() and gart_iommu_hole_init() if GART is not available. Current code doesn't break on existing processors but there are some side effects: We get bogus AGP aperture messages which are simply noise on GART-less processors: AGP: Node 0: aperture [bus addr 0x00000000-0x01ffffff] (32MB) AGP: Your BIOS doesn't leave aperture memory hole AGP: Please enable the IOMMU option in the BIOS setup AGP: This costs you 64MB of RAM AGP: Mapping aperture over RAM [mem 0xd4000000-0xd7ffffff] We can avoid calling allocate_aperture() and would not have to wastefully reserve 64MB of RAM with memblock_reserve(). Also, we can avoid having to loop through all PCI buses and devices twice, searching for a non-existent AGP bridge if we bail out early. Refactor the family check used in amd_nb.c into an inline function so we can use it here as well as in amd_nb.c Fix some typos while at it. Tested the patch on Fam10h and Fam15h Model 00h-fh and this code runs fine. On Fam15h Model 60h-6fh and on Fam16h, we bail early as they don't have GART. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Reviewed-by: Suravee Suthikulpanit Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Joerg Rodel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1428443197-3834-1-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/amd_nb.h | 11 +++++++++++ arch/x86/kernel/amd_nb.c | 4 +--- arch/x86/kernel/aperture_64.c | 8 +++++++- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index aaac3b2fb746..1a5da2e63aee 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -98,11 +98,22 @@ static inline u16 amd_get_node_id(struct pci_dev *pdev) return 0; } +static inline bool amd_gart_present(void) +{ + /* GART present only on Fam15h, upto model 0fh */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) + return true; + + return false; +} + #else #define amd_nb_num(x) 0 #define amd_nb_has_feature(x) false #define node_to_amd_nb(x) NULL +#define amd_gart_present(x) false #endif diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 5caed1dd7ccf..29fa475ec518 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -89,9 +89,7 @@ int amd_cache_northbridges(void) next_northbridge(link, amd_nb_link_ids); } - /* GART present only on Fam15h upto model 0fh */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) + if (amd_gart_present()) amd_northbridges.flags |= AMD_NB_GART; /* diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 76164e173a24..6e85f713641d 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -262,6 +262,9 @@ void __init early_gart_iommu_check(void) u64 aper_base = 0, last_aper_base = 0; int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0; + if (!amd_gart_present()) + return; + if (!early_pci_allowed()) return; @@ -355,6 +358,9 @@ int __init gart_iommu_hole_init(void) int fix, slot, valid_agp = 0; int i, node; + if (!amd_gart_present()) + return -ENODEV; + if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) return -ENODEV; @@ -452,7 +458,7 @@ out: force_iommu || valid_agp || fallback_aper_force) { - pr_info("Your BIOS doesn't leave a aperture memory hole\n"); + pr_info("Your BIOS doesn't leave an aperture memory hole\n"); pr_info("Please enable the IOMMU option in the BIOS setup\n"); pr_info("This costs you %dMB of RAM\n", 32 << fallback_aper_order); -- cgit v1.2.1 From b9d16a2a21aa9c264a29dd84d6f7b03581517a03 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 27 Apr 2015 10:25:51 -0500 Subject: x86/cpu/amd: Set X86_FEATURE_EXTD_APICID for future processors Decision to use a 4-bit mask or 8-bit mask in default_get_apic_id() is controlled by setting capability bit X86_FEATURE_EXTD_APICID. Currently, we detect extended APIC ID support by accessing Link Transaction Control register D18F0x68 in PCI config space. But, not even that is needed as we can safely postulate that future AMD processors will support 8-bit APIC IDs and we can simply set that feature bit on them, without the PCI access. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jacob Shin Cc: Paolo Bonzini Cc: Thomas Gleixner Cc: dave.hansen@linux.intel.com Cc: hecmargi@upv.es Cc: mgorman@suse.de Link: http://lkml.kernel.org/r/1430148351-9013-1-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e4cf63301ff4..94e7051fba1a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -520,8 +520,16 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) - /* check CPU config space for extended APIC ID */ - if (cpu_has_apic && c->x86 >= 0xf) { + /* + * ApicID can always be treated as an 8-bit value for AMD APIC versions + * >= 0x10, but even old K8s came out of reset with version 0x10. So, we + * can safely set X86_FEATURE_EXTD_APICID unconditionally for families + * after 16h. + */ + if (cpu_has_apic && c->x86 > 0x16) { + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } else if (cpu_has_apic && c->x86 >= 0xf) { + /* check CPU config space for extended APIC ID */ unsigned int val; val = read_pci_config(0, 24, 0, 0x68); if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) -- cgit v1.2.1 From a1a32d29f941b7219be07f9e76455a5e4ce4e9c4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 10 Apr 2015 12:50:57 +0200 Subject: x86/microcode/intel: Get rid of revision_is_newer() It is a one-liner for checking microcode header revisions. On top of that, it can be used wrong as it was the case in _save_mc(). Get rid of it. Signed-off-by: Borislav Petkov Acked-by: Quentin Casasnovas Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode_intel.h | 6 ------ arch/x86/kernel/cpu/microcode/intel_early.c | 2 +- arch/x86/kernel/cpu/microcode/intel_lib.c | 6 +++--- 3 files changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 2b9209c46ca9..a4df6d292228 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -60,12 +60,6 @@ extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc) extern int microcode_sanity_check(void *mc, int print_err); extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc); -static inline int -revision_is_newer(struct microcode_header_intel *mc_header, int rev) -{ - return (mc_header->rev <= rev) ? 0 : 1; -} - #ifdef CONFIG_MICROCODE_INTEL_EARLY extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 2f49ab4ac0ae..a394350e6092 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -262,7 +262,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, found = 1; - if (!revision_is_newer(mc_hdr, new_rev)) + if (mc_hdr->rev <= mc_saved_hdr->rev) continue; /* diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index cd47a510a3f1..63b0a2e059ee 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -154,13 +154,13 @@ int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc) /* * Returns 1 if update has been found, 0 otherwise. */ -int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc) +int get_matching_microcode(unsigned int csig, int cpf, int new_rev, void *mc) { struct microcode_header_intel *mc_hdr = mc; - if (!revision_is_newer(mc_hdr, rev)) + if (mc_hdr->rev <= new_rev) return 0; - return get_matching_sig(csig, cpf, rev, mc); + return get_matching_sig(csig, cpf, new_rev, mc); } EXPORT_SYMBOL_GPL(get_matching_microcode); -- cgit v1.2.1 From da9b50765e6ea3e9113df3a14a63700e47a670b7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 10 Apr 2015 14:32:10 +0200 Subject: x86/microcode/intel: Remove unused @rev arg of get_matching_sig() @rev wasn't used in get_matching_sig(), drop it. Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode_intel.h | 2 +- arch/x86/kernel/cpu/microcode/intel_early.c | 5 ++--- arch/x86/kernel/cpu/microcode/intel_lib.c | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index a4df6d292228..3564299863f0 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -58,7 +58,7 @@ struct extended_sigtable { extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc); extern int microcode_sanity_check(void *mc, int print_err); -extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc); +extern int get_matching_sig(unsigned int csig, int cpf, void *mc); #ifdef CONFIG_MICROCODE_INTEL_EARLY extern void __init load_ucode_intel_bsp(void); diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index a394350e6092..45881e92f1be 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -246,7 +246,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, unsigned int num_saved) { struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - unsigned int sig, pf, new_rev; + unsigned int sig, pf; int found = 0, i; mc_hdr = (struct microcode_header_intel *)ucode_ptr; @@ -255,9 +255,8 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; sig = mc_saved_hdr->sig; pf = mc_saved_hdr->pf; - new_rev = mc_hdr->rev; - if (!get_matching_sig(sig, pf, new_rev, ucode_ptr)) + if (!get_matching_sig(sig, pf, ucode_ptr)) continue; found = 1; diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 63b0a2e059ee..7de293726923 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -124,7 +124,7 @@ EXPORT_SYMBOL_GPL(microcode_sanity_check); /* * Returns 1 if update has been found, 0 otherwise. */ -int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc) +int get_matching_sig(unsigned int csig, int cpf, void *mc) { struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header; @@ -161,6 +161,6 @@ int get_matching_microcode(unsigned int csig, int cpf, int new_rev, void *mc) if (mc_hdr->rev <= new_rev) return 0; - return get_matching_sig(csig, cpf, new_rev, mc); + return get_matching_sig(csig, cpf, mc); } EXPORT_SYMBOL_GPL(get_matching_microcode); -- cgit v1.2.1 From 760d765b2bb662be177d4b5b271ced8debc803ac Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 18 Mar 2015 19:28:56 +0100 Subject: x86/microcode: Parse built-in microcode early Apparently, people do build microcode into the kernel image, i.e. CONFIG_FIRMWARE_IN_KERNEL=y. Make that work in the early loader which is where microcode should be preferably loaded anyway. Note that you need to specify the microcode filename with the path relative to the toplevel firmware directory (the same like the late loading method) in CONFIG_EXTRA_FIRMWARE=y so that early loader can find it. I.e., something like this (Intel variant): CONFIG_FIRMWARE_IN_KERNEL=y CONFIG_EXTRA_FIRMWARE="intel-ucode/06-3a-09" CONFIG_EXTRA_FIRMWARE_DIR="/lib/firmware/" While at it, add me to the loader copyright boilerplate. Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Daniel J Blueman Cc: H. Peter Anvin Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode.h | 8 +++++++- arch/x86/include/asm/microcode_amd.h | 4 ++-- arch/x86/kernel/cpu/microcode/amd_early.c | 19 ++++++++++++++++--- arch/x86/kernel/cpu/microcode/core_early.c | 23 ++++++++++++++++++++++- arch/x86/kernel/cpu/microcode/intel_early.c | 23 +++++++++++++++++++++-- 5 files changed, 68 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 2fb20d6f7e23..9e6278c7140e 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H +#include + #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = native_read_msr((msr)); \ @@ -152,6 +154,7 @@ extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); extern int __init save_microcode_in_initrd(void); void reload_early_microcode(void); +extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); #else static inline void __init load_ucode_bsp(void) {} static inline void load_ucode_ap(void) {} @@ -160,6 +163,9 @@ static inline int __init save_microcode_in_initrd(void) return 0; } static inline void reload_early_microcode(void) {} +static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) +{ + return false; +} #endif - #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index af935397e053..b8438543f340 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -65,12 +65,12 @@ extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, s extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; #ifdef CONFIG_MICROCODE_AMD_EARLY -extern void __init load_ucode_amd_bsp(void); +extern void __init load_ucode_amd_bsp(int family); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); void reload_ucode_amd(void); #else -static inline void __init load_ucode_amd_bsp(void) {} +static inline void __init load_ucode_amd_bsp(int family) {} static inline void load_ucode_amd_ap(void) {} static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } void reload_ucode_amd(void) {} diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 737737edbd1e..9208a36d0f03 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -228,7 +228,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) } } -void __init load_ucode_amd_bsp(void) +static bool __init load_builtin_amd_microcode(struct cpio_data *cp, int family) +{ + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + + if (family >= 0x15) + snprintf(fw_name, sizeof(fw_name), + "amd-ucode/microcode_amd_fam%.2xh.bin", family); + + return get_builtin_firmware(cp, fw_name); +} + +void __init load_ucode_amd_bsp(int family) { struct cpio_data cp; void **data; @@ -243,8 +254,10 @@ void __init load_ucode_amd_bsp(void) #endif cp = find_ucode_in_initrd(); - if (!cp.data) - return; + if (!cp.data) { + if (!load_builtin_amd_microcode(&cp, family)) + return; + } *data = cp.data; *size = cp.size; diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index a413a69cbd74..979ca78b1561 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -3,6 +3,7 @@ * * Copyright (C) 2012 Fenghua Yu * H Peter Anvin" + * (C) 2015 Borislav Petkov * * This driver allows to early upgrade microcode on Intel processors * belonging to IA-32 family - PentiumPro, Pentium II, @@ -17,6 +18,7 @@ * 2 of the License, or (at your option) any later version. */ #include +#include #include #include #include @@ -43,6 +45,25 @@ static bool __init check_loader_disabled_bsp(void) return *res; } +extern struct builtin_fw __start_builtin_fw[]; +extern struct builtin_fw __end_builtin_fw[]; + +bool get_builtin_firmware(struct cpio_data *cd, const char *name) +{ +#ifdef CONFIG_FW_LOADER + struct builtin_fw *b_fw; + + for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { + if (!strcmp(name, b_fw->name)) { + cd->size = b_fw->size; + cd->data = b_fw->data; + return true; + } + } +#endif + return false; +} + void __init load_ucode_bsp(void) { int vendor, family; @@ -63,7 +84,7 @@ void __init load_ucode_bsp(void) break; case X86_VENDOR_AMD: if (family >= 0x10) - load_ucode_amd_bsp(); + load_ucode_amd_bsp(family); break; default: break; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 45881e92f1be..ccd474a7a59e 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -521,6 +521,23 @@ out: EXPORT_SYMBOL_GPL(save_mc_for_early); #endif +static bool __init load_builtin_intel_microcode(struct cpio_data *cp) +{ + u32 eax = 0x00000001, ebx, ecx = 0, edx; + int family, model, stepping; + char name[30]; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + family = __x86_family(eax); + model = x86_model(eax); + stepping = eax & 0xf; + + sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping); + + return get_builtin_firmware(cp, name); +} + static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, @@ -539,8 +556,10 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, cd.size = 0; cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) - return UCODE_ERROR; + if (!cd.data) { + if (!load_builtin_intel_microcode(&cd)) + return UCODE_ERROR; + } return get_matching_model_microcode(0, start, cd.data, cd.size, mc_saved_data, initrd, uci); -- cgit v1.2.1 From 5b673a48c54594108aec368014efc7334743f06a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 4 Apr 2015 16:40:45 +0200 Subject: x86/alternatives: Document macros Add some text to the macro magic for future reference and against failing human memory. Requested-by: Ingo Molnar Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative-asm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index bdf02eeee765..e7636bac7372 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -18,6 +18,12 @@ .endm #endif +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ .macro altinstruction_entry orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . @@ -27,6 +33,12 @@ .byte \pad_len .endm +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. ".skip" directive takes care of proper instruction padding + * in case @newinstr is longer than @oldinstr. + */ .macro ALTERNATIVE oldinstr, newinstr, feature 140: \oldinstr @@ -55,6 +67,12 @@ */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + +/* + * Same as ALTERNATIVE macro above but for two alternatives. If CPU + * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has + * @feature2, it replaces @oldinstr with @feature2. + */ .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 140: \oldinstr -- cgit v1.2.1 From afdf344e08fbec28ab2204a626fa1f260dcb68be Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:53 -0500 Subject: x86/mce/amd: Factor out logging mechanism Refactor the code here to setup struct mce and call mce_log() to log the error. We're going to reuse this in a later patch as part of the deferred error interrupt enablement. No functional change is introduced. Suggested-by: Borislav Petkov Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-2-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 55ad9b37cae8..5f25de20db36 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -264,6 +264,27 @@ init: } } +static void __log_error(unsigned int bank, bool threshold_err, u64 misc) +{ + struct mce m; + u64 status; + + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + if (!(status & MCI_STATUS_VAL)) + return; + + mce_setup(&m); + + m.status = status; + m.bank = bank; + if (threshold_err) + m.misc = misc; + + mce_log(&m); + + wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); +} + /* * APIC Interrupt Handler */ @@ -273,12 +294,12 @@ init: * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. */ + static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; int cpu = smp_processor_id(); unsigned int bank, block; - struct mce m; /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { @@ -321,15 +342,7 @@ static void amd_threshold_interrupt(void) return; log: - mce_setup(&m); - rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); - if (!(m.status & MCI_STATUS_VAL)) - return; - m.misc = ((u64)high << 32) | low; - m.bank = bank; - mce_log(&m); - - wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); + __log_error(bank, true, ((u64)high << 32) | low); } /* -- cgit v1.2.1 From 6e6e746e33e9555a7fce159d25314c9df3bcda93 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:54 -0500 Subject: x86/mce/amd: Collect valid address before logging an error amd_decode_mce() needs value in m->addr so it can report the error address correctly. This should be setup in __log_error() before we call mce_log(). We do this because the error address is an important bit of information which should be conveyed to userspace. The correct output then reports proper address, like this: [Hardware Error]: Corrected error, no action required. [Hardware Error]: CPU:0 (15:60:0) MC0_STATUS [-|CE|-|-|AddrV|-|-|CECC]: 0x840041000028017b [Hardware Error]: MC0 Error Address: 0x00001f808f0ff040 Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5f25de20db36..607075726e10 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -277,11 +277,14 @@ static void __log_error(unsigned int bank, bool threshold_err, u64 misc) m.status = status; m.bank = bank; + if (threshold_err) m.misc = misc; - mce_log(&m); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr); + mce_log(&m); wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); } -- cgit v1.2.1 From 7559e13fb4abe7880dfaf985d6a1630ca90a67ce Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:55 -0500 Subject: x86/mce: Add support for deferred errors on AMD Deferred errors indicate error conditions that were not corrected, but those errors have not been consumed yet. They require no action from S/W (or action is optional). These errors provide info about a latent uncorrectable MCE that can occur when a poisoned data is consumed by the processor. Newer AMD processors can generate deferred errors and can be configured to generate APIC interrupts on such events. SUCCOR stands for S/W UnCorrectable error COntainment and Recovery. It indicates support for data poisoning in HW and deferred error interrupts. Add new bitfield to mce_vendor_flags for this. We use this to verify presence of deferred error interrupts before we enable them in mce_amd.c While at it, clarify comments in mce_vendor_flags to provide an indication of usages of the bitfields. Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-4-git-send-email-Aravind.Gopalakrishnan@amd.com [ beef up commit message, do CPUID(8000_0007) only once. ] Signed-off-by: Borislav Petkov --- arch/x86/include/asm/mce.h | 15 +++++++++++++-- arch/x86/kernel/cpu/mcheck/mce.c | 10 ++++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 1f5a86d518db..407ced642ac1 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -117,8 +117,19 @@ struct mca_config { }; struct mce_vendor_flags { - __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ - __reserved_0 : 63; + /* + * overflow recovery cpuid bit indicates that overflow + * conditions are not fatal + */ + __u64 overflow_recov : 1, + + /* + * SUCCOR stands for S/W UnCorrectable error COntainment + * and Recovery. It indicates support for data poisoning + * in HW and deferred error interrupts. + */ + succor : 1, + __reserved_0 : 62; }; extern struct mce_vendor_flags mce_flags; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e535533d5ab8..521e5016aca6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1637,10 +1637,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_intel_feature_init(c); mce_adjust_timer = cmci_intel_adjust_timer; break; - case X86_VENDOR_AMD: + + case X86_VENDOR_AMD: { + u32 ebx = cpuid_ebx(0x80000007); + mce_amd_feature_init(c); - mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; + mce_flags.overflow_recov = !!(ebx & BIT(0)); + mce_flags.succor = !!(ebx & BIT(1)); break; + } + default: break; } -- cgit v1.2.1 From 507224aa88fc2c11f252705ad286bca155e0df02 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Sun, 12 Apr 2015 17:37:24 +0200 Subject: serial: 8250: remove Kconfig indirection Remove CONFIG_SERIAL_DETECT_IRQ and CONFIG_SERIAL_MANY_PORTS, and substitute all references to the proper 8250 Kconfig options. Now, the actual Kconfig dependencies are not hidden when reading the code and static analyzers are less confused. Signed-off-by: Valentin Rothberg Reviewed-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/serial.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h index 8378b8c9109c..bb658211edad 100644 --- a/arch/x86/include/asm/serial.h +++ b/arch/x86/include/asm/serial.h @@ -11,7 +11,7 @@ #define BASE_BAUD (1843200/16) /* Standard COM flags (except for COM4, because of the 8514 problem) */ -#ifdef CONFIG_SERIAL_DETECT_IRQ +#ifdef CONFIG_SERIAL_8250_DETECT_IRQ # define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ) # define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | UPF_AUTO_IRQ) #else -- cgit v1.2.1 From 24fd78a81f6d3fe7f7a440c8629f9c52cd5f830e Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:56 -0500 Subject: x86/mce/amd: Introduce deferred error interrupt handler Deferred errors indicate error conditions that were not corrected, but require no action from S/W (or action is optional).These errors provide info about a latent UC MCE that can occur when a poisoned data is consumed by the processor. Processors that report these errors can be configured to generate APIC interrupts to notify OS about the error. Provide an interrupt handler in this patch so that OS can catch these errors as and when they happen. Currently, we simply log the errors and exit the handler as S/W action is not mandated. Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-5-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/entry_arch.h | 3 ++ arch/x86/include/asm/hardirq.h | 3 ++ arch/x86/include/asm/hw_irq.h | 2 + arch/x86/include/asm/irq_vectors.h | 1 + arch/x86/include/asm/mce.h | 3 ++ arch/x86/include/asm/trace/irq_vectors.h | 6 +++ arch/x86/include/asm/traps.h | 3 +- arch/x86/kernel/cpu/mcheck/mce_amd.c | 93 ++++++++++++++++++++++++++++++++ arch/x86/kernel/entry_64.S | 5 ++ arch/x86/kernel/irq.c | 6 +++ arch/x86/kernel/irqinit.c | 4 ++ arch/x86/kernel/traps.c | 5 ++ 12 files changed, 133 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index dc5fa661465f..6da46dbaac87 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -50,4 +50,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif +#ifdef CONFIG_X86_MCE_AMD +BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR) +#endif #endif diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 0f5fb6b6567e..db9f536f482f 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -33,6 +33,9 @@ typedef struct { #ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; #endif +#ifdef CONFIG_X86_MCE_AMD + unsigned int irq_deferred_error_count; +#endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) unsigned int irq_hv_callback_count; #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e9571ddabc4f..f71e489d7537 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -73,6 +73,7 @@ extern asmlinkage void invalidate_interrupt31(void); extern asmlinkage void irq_move_cleanup_interrupt(void); extern asmlinkage void reboot_interrupt(void); extern asmlinkage void threshold_interrupt(void); +extern asmlinkage void deferred_error_interrupt(void); extern asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); @@ -87,6 +88,7 @@ extern void trace_spurious_interrupt(void); extern void trace_thermal_interrupt(void); extern void trace_reschedule_interrupt(void); extern void trace_threshold_interrupt(void); +extern void trace_deferred_error_interrupt(void); extern void trace_call_function_interrupt(void); extern void trace_call_function_single_interrupt(void); #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 666c89ec4bd7..026fc1e1599c 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -113,6 +113,7 @@ #define IRQ_WORK_VECTOR 0xf6 #define UV_BAU_MESSAGE 0xf5 +#define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 407ced642ac1..6a3034a0a072 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -234,6 +234,9 @@ void do_machine_check(struct pt_regs *, long); extern void (*mce_threshold_vector)(void); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +/* Deferred error interrupt handler */ +extern void (*deferred_error_int_vector)(void); + /* * Thermal handler */ diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 4cab890007a7..38a09a13a9bc 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -100,6 +100,12 @@ DEFINE_IRQ_VECTOR_EVENT(call_function_single); */ DEFINE_IRQ_VECTOR_EVENT(threshold_apic); +/* + * deferred_error_apic - called when entering/exiting a deferred apic interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); + /* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 4e49d7dff78e..c5380bea2a36 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -108,7 +108,8 @@ extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); -asmlinkage void mce_threshold_interrupt(void); +asmlinkage void smp_threshold_interrupt(void); +asmlinkage void smp_deferred_error_interrupt(void); #endif extern enum ctx_state ist_enter(struct pt_regs *regs); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 607075726e10..2e7ebe7e1e80 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -12,6 +12,8 @@ * - added support for AMD Family 0x10 processors * May 2012 * - major scrubbing + * May 2015 + * - add support for deferred error interrupts (Aravind Gopalakrishnan) * * All MC4_MISCi registers are shared between multi-cores */ @@ -32,6 +34,7 @@ #include #include #include +#include #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -47,6 +50,13 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 +/* Deferred error settings */ +#define MSR_CU_DEF_ERR 0xC0000410 +#define MASK_DEF_LVTOFF 0x000000F0 +#define MASK_DEF_INT_TYPE 0x00000006 +#define DEF_LVT_OFF 0x2 +#define DEF_INT_TYPE_APIC 0x2 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -60,6 +70,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); +static void amd_deferred_error_interrupt(void); + +static void default_deferred_error_interrupt(void) +{ + pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); +} +void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; /* * CPU Initialization @@ -205,6 +222,39 @@ static int setup_APIC_mce(int reserved, int new) return reserved; } +static int setup_APIC_deferred_error(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) +{ + u32 low = 0, high = 0; + int def_offset = -1, def_new; + + if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) + return; + + def_new = (low & MASK_DEF_LVTOFF) >> 4; + if (!(low & MASK_DEF_LVTOFF)) { + pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); + def_new = DEF_LVT_OFF; + low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); + } + + def_offset = setup_APIC_deferred_error(def_offset, def_new); + if ((def_offset == def_new) && + (deferred_error_int_vector != amd_deferred_error_interrupt)) + deferred_error_int_vector = amd_deferred_error_interrupt; + + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + wrmsr(MSR_CU_DEF_ERR, low, high); +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -262,6 +312,9 @@ init: mce_threshold_block_init(&b, offset); } } + + if (mce_flags.succor) + deferred_error_interrupt_enable(c); } static void __log_error(unsigned int bank, bool threshold_err, u64 misc) @@ -288,6 +341,46 @@ static void __log_error(unsigned int bank, bool threshold_err, u64 misc) wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); } +static inline void __smp_deferred_error_interrupt(void) +{ + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); +} + +asmlinkage __visible void smp_deferred_error_interrupt(void) +{ + entering_irq(); + __smp_deferred_error_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +{ + entering_irq(); + trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); + __smp_deferred_error_interrupt(); + trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + exiting_ack_irq(); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + u64 status; + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + + if (!(status & MCI_STATUS_VAL) || + !(status & MCI_STATUS_DEFERRED)) + continue; + + __log_error(bank, false, 0); + break; + } +} + /* * APIC Interrupt Handler */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 02c2eff7478d..12aea85fe738 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -935,6 +935,11 @@ apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt #endif +#ifdef CONFIG_X86_MCE_AMD +apicinterrupt DEFERRED_ERROR_VECTOR \ + deferred_error_interrupt smp_deferred_error_interrupt +#endif + #ifdef CONFIG_X86_THERMAL_VECTOR apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e5952c225532..590ed6c1bf51 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -116,6 +116,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_puts(p, " Threshold APIC interrupts\n"); #endif +#ifdef CONFIG_X86_MCE_AMD + seq_printf(p, "%*s: ", prec, "DFR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count); + seq_puts(p, " Deferred Error APIC interrupts\n"); +#endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index cd10a6437264..d7ec6e7b2b5b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -135,6 +135,10 @@ static void __init apic_intr_init(void) alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif +#ifdef CONFIG_X86_MCE_AMD + alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); +#endif + #ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 324ab5247687..68b1d5979a46 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -827,6 +827,11 @@ asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) { } +asmlinkage __visible void __attribute__((weak)) +smp_deferred_error_interrupt(void) +{ +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task -- cgit v1.2.1 From 5c0d728e1a8ccbaf68ec37181e466627ba0a6efc Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:57 -0500 Subject: x86/irq: Cleanup ordering of vector numbers Sort vector number assignments in proper descending order. No functional change. Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Link: http://lkml.kernel.org/r/1430913538-1415-6-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/irq_vectors.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 026fc1e1599c..8900ba444ad7 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -102,11 +102,6 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 -/* Vector for KVM to deliver posted interrupt IPI */ -#ifdef CONFIG_HAVE_KVM -#define POSTED_INTR_VECTOR 0xf2 -#endif - /* * IRQ work vector: */ @@ -118,6 +113,11 @@ /* Vector on which hypervisor callbacks will be delivered */ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 +/* Vector for KVM to deliver posted interrupt IPI */ +#ifdef CONFIG_HAVE_KVM +#define POSTED_INTR_VECTOR 0xf2 +#endif + /* * Local APIC timer IRQ vector is on a different priority level, * to work around the 'lost local interrupt if more than 2 IRQ -- cgit v1.2.1 From 868c00bb5980653c44d931384baa2c7f1bde81ef Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:58 -0500 Subject: x86/mce/amd: Rename setup_APIC_mce 'setup_APIC_mce' doesn't give us an indication of why we are going to program LVT. Make that explicit by renaming it to setup_APIC_mce_threshold so we know. No functional change is introduced. Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-7-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 2e7ebe7e1e80..70e1bf6f784d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -213,7 +213,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) threshold_restart_bank(&tr); }; -static int setup_APIC_mce(int reserved, int new) +static int setup_APIC_mce_threshold(int reserved, int new) { if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0)) @@ -302,7 +302,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) b.interrupt_enable = 1; new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce(offset, new); + offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) -- cgit v1.2.1 From 956079e081427fe0c929eb284ab7e39f9b8e2023 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 6 May 2015 12:15:54 -0700 Subject: x86/platform/atom/punit: Add Punit device state debug driver The patch adds a debug driver, which dumps the power states of all the North complex (NC) devices. This debug interface is useful to figure out the devices, which blocks the S0ix transitions on the platform. This is extremely useful during enabling PM on customer platforms and derivatives. This submission is based on the submission from Mahesh Kumar P: https://lkml.org/lkml/2014/11/5/367 Signed-off-by: Srinivas Pandruvada Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Mahesh Kumar P Cc: Thomas Gleixner Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1430939754-6900-2-git-send-email-srinivas.pandruvada@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 11 ++ arch/x86/platform/Makefile | 1 + arch/x86/platform/atom/Makefile | 1 + arch/x86/platform/atom/punit_atom_debug.c | 183 ++++++++++++++++++++++++++++++ 4 files changed, 196 insertions(+) create mode 100644 arch/x86/platform/atom/Makefile create mode 100644 arch/x86/platform/atom/punit_atom_debug.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 72484a645f05..a5973f851750 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -332,4 +332,15 @@ config X86_DEBUG_STATIC_CPU_HAS If unsure, say N. +config PUNIT_ATOM_DEBUG + tristate "ATOM Punit debug driver" + select DEBUG_FS + select IOSF_MBI + ---help--- + This is a debug driver, which gets the power states + of all Punit North Complex devices. The power states of + each device is exposed as part of the debugfs interface. + The current power state can be read from + /sys/kernel/debug/punit_atom/dev_power_state + endmenu diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index a62e0be3a2f1..f1a6c8e86ddd 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,4 +1,5 @@ # Platform specific code goes here +obj-y += atom/ obj-y += ce4100/ obj-y += efi/ obj-y += geode/ diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile new file mode 100644 index 000000000000..0a3a40cbc794 --- /dev/null +++ b/arch/x86/platform/atom/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c new file mode 100644 index 000000000000..5ca8ead91579 --- /dev/null +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -0,0 +1,183 @@ +/* + * Intel SOC Punit device state debug driver + * Punit controls power management for North Complex devices (Graphics + * blocks, Image Signal Processing, video processing, display, DSP etc.) + * + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Side band Interface port */ +#define PUNIT_PORT 0x04 +/* Power gate status reg */ +#define PWRGT_STATUS 0x61 +/* Subsystem config/status Video processor */ +#define VED_SS_PM0 0x32 +/* Subsystem config/status ISP (Image Signal Processor) */ +#define ISP_SS_PM0 0x39 +/* Subsystem config/status Input/output controller */ +#define MIO_SS_PM 0x3B +/* Shift bits for getting status for video, isp and i/o */ +#define SSS_SHIFT 24 +/* Shift bits for getting status for graphics rendering */ +#define RENDER_POS 0 +/* Shift bits for getting status for media control */ +#define MEDIA_POS 2 +/* Shift bits for getting status for Valley View/Baytrail display */ +#define VLV_DISPLAY_POS 6 +/* Subsystem config/status display for Cherry Trail SOC */ +#define CHT_DSP_SSS 0x36 +/* Shift bits for getting status for display */ +#define CHT_DSP_SSS_POS 16 + +struct punit_device { + char *name; + int reg; + int sss_pos; +}; + +static const struct punit_device punit_device_byt[] = { + { "GFX RENDER", PWRGT_STATUS, RENDER_POS }, + { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS }, + { "DISPLAY", PWRGT_STATUS, VLV_DISPLAY_POS }, + { "VED", VED_SS_PM0, SSS_SHIFT }, + { "ISP", ISP_SS_PM0, SSS_SHIFT }, + { "MIO", MIO_SS_PM, SSS_SHIFT }, + { NULL } +}; + +static const struct punit_device punit_device_cht[] = { + { "GFX RENDER", PWRGT_STATUS, RENDER_POS }, + { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS }, + { "DISPLAY", CHT_DSP_SSS, CHT_DSP_SSS_POS }, + { "VED", VED_SS_PM0, SSS_SHIFT }, + { "ISP", ISP_SS_PM0, SSS_SHIFT }, + { "MIO", MIO_SS_PM, SSS_SHIFT }, + { NULL } +}; + +static const char * const dstates[] = {"D0", "D0i1", "D0i2", "D0i3"}; + +static int punit_dev_state_show(struct seq_file *seq_file, void *unused) +{ + u32 punit_pwr_status; + struct punit_device *punit_devp = seq_file->private; + int index; + int status; + + seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n"); + while (punit_devp->name) { + status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ, + punit_devp->reg, + &punit_pwr_status); + if (status) { + seq_printf(seq_file, "%9s : Read Failed\n", + punit_devp->name); + } else { + index = (punit_pwr_status >> punit_devp->sss_pos) & 3; + seq_printf(seq_file, "%9s : %s\n", punit_devp->name, + dstates[index]); + } + punit_devp++; + } + + return 0; +} + +static int punit_dev_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, punit_dev_state_show, inode->i_private); +} + +static const struct file_operations punit_dev_state_ops = { + .open = punit_dev_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *punit_dbg_file; + +static int punit_dbgfs_register(struct punit_device *punit_device) +{ + static struct dentry *dev_state; + + punit_dbg_file = debugfs_create_dir("punit_atom", NULL); + if (!punit_dbg_file) + return -ENXIO; + + dev_state = debugfs_create_file("dev_power_state", S_IFREG | S_IRUGO, + punit_dbg_file, punit_device, + &punit_dev_state_ops); + if (!dev_state) { + pr_err("punit_dev_state register failed\n"); + debugfs_remove(punit_dbg_file); + return -ENXIO; + } + + return 0; +} + +static void punit_dbgfs_unregister(void) +{ + debugfs_remove_recursive(punit_dbg_file); +} + +#define ICPU(model, drv_data) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT,\ + (kernel_ulong_t)&drv_data } + +static const struct x86_cpu_id intel_punit_cpu_ids[] = { + ICPU(55, punit_device_byt), /* Valleyview, Bay Trail */ + ICPU(76, punit_device_cht), /* Braswell, Cherry Trail */ + {} +}; + +MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids); + +static int __init punit_atom_debug_init(void) +{ + const struct x86_cpu_id *id; + int ret; + + id = x86_match_cpu(intel_punit_cpu_ids); + if (!id) + return -ENODEV; + + ret = punit_dbgfs_register((struct punit_device *)id->driver_data); + if (ret < 0) + return ret; + + return 0; +} + +static void __exit punit_atom_debug_exit(void) +{ + punit_dbgfs_unregister(); +} + +module_init(punit_atom_debug_init); +module_exit(punit_atom_debug_exit); + +MODULE_AUTHOR("Kumar P, Mahesh "); +MODULE_AUTHOR("Srinivas Pandruvada "); +MODULE_DESCRIPTION("Driver for Punit devices states debugging"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.1 From ff7bbb9c6ab6e6620429daeff39424bbde1a94b4 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Thu, 23 Apr 2015 17:12:42 -0400 Subject: kvmclock: set scheduler clock stable If you try to enable NOHZ_FULL on a guest today, you'll get the following error when the guest tries to deactivate the scheduler tick: WARNING: CPU: 3 PID: 2182 at kernel/time/tick-sched.c:192 can_stop_full_tick+0xb9/0x290() NO_HZ FULL will not work with unstable sched clock CPU: 3 PID: 2182 Comm: kworker/3:1 Not tainted 4.0.0-10545-gb9bb6fb #204 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Workqueue: events flush_to_ldisc ffffffff8162a0c7 ffff88011f583e88 ffffffff814e6ba0 0000000000000002 ffff88011f583ed8 ffff88011f583ec8 ffffffff8104d095 ffff88011f583eb8 0000000000000000 0000000000000003 0000000000000001 0000000000000001 Call Trace: [] dump_stack+0x4f/0x7b [] warn_slowpath_common+0x85/0xc0 [] warn_slowpath_fmt+0x46/0x50 [] can_stop_full_tick+0xb9/0x290 [] tick_nohz_irq_exit+0x8d/0xb0 [] irq_exit+0xc5/0x130 [] smp_apic_timer_interrupt+0x4a/0x60 [] apic_timer_interrupt+0x6e/0x80 [] ? _raw_spin_unlock_irqrestore+0x31/0x60 [] __wake_up+0x48/0x60 [] n_tty_receive_buf_common+0x49c/0xba0 [] ? tty_ldisc_ref+0x1f/0x70 [] n_tty_receive_buf2+0x14/0x20 [] flush_to_ldisc+0xe0/0x120 [] process_one_work+0x1d5/0x540 [] ? process_one_work+0x151/0x540 [] worker_thread+0x121/0x470 [] ? process_one_work+0x540/0x540 [] kthread+0xef/0x110 [] ? __kthread_parkme+0xa0/0xa0 [] ret_from_fork+0x42/0x70 [] ? __kthread_parkme+0xa0/0xa0 ---[ end trace 06e3507544a38866 ]--- However, it turns out that kvmclock does provide a stable sched_clock callback. So, let the scheduler know this which in turn makes NOHZ_FULL work in the guest. Signed-off-by: Marcelo Tosatti Signed-off-by: Luiz Capitulino Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 42caaef897c8..4e03921761c4 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -265,6 +266,8 @@ void __init kvmclock_init(void) if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + set_sched_clock_stable(); } int __init kvm_setup_vsyscall_timeinfo(void) -- cgit v1.2.1 From ccf73aaf5adfa37f45be12459c17f534e8f2c2c5 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 30 Apr 2015 13:43:31 +0200 Subject: KVM: arm/mips/x86/power use __kvm_guest_{enter|exit} Use __kvm_guest_{enter|exit} instead of kvm_guest_{enter|exit} where interrupts are disabled. Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c73efcd03e29..7a959be0aebc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6347,7 +6347,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) smp_send_reschedule(vcpu->cpu); - kvm_guest_enter(); + __kvm_guest_enter(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); -- cgit v1.2.1 From 90de4a1875180f8347c075319af2cce586c96ab6 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 13 Apr 2015 01:53:41 +0300 Subject: KVM: x86: Support for disabling quirks Introducing KVM_CAP_DISABLE_QUIRKS for disabling x86 quirks that were previous created in order to overcome QEMU issues. Those issue were mostly result of invalid VM BIOS. Currently there are two quirks that can be disabled: 1. KVM_QUIRK_LINT0_REENABLED - LINT0 was enabled after boot 2. KVM_QUIRK_CD_NW_CLEARED - CD and NW are cleared after boot These two issues are already resolved in recent releases of QEMU, and would therefore be disabled by QEMU. Signed-off-by: Nadav Amit Message-Id: <1428879221-29996-1-git-send-email-namit@cs.technion.ac.il> [Report capability from KVM_CHECK_EXTENSION too. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/include/uapi/asm/kvm.h | 3 +++ arch/x86/kvm/lapic.c | 5 +++-- arch/x86/kvm/svm.c | 3 ++- arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++++++++ 5 files changed, 40 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dea2e7e962e3..f80ad591aa61 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -635,6 +635,8 @@ struct kvm_arch { #endif bool boot_vcpu_runs_old_kvmclock; + + u64 disabled_quirks; }; struct kvm_vm_stat { diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index d7dcef58aefa..2fec75e4b1e1 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -345,4 +345,7 @@ struct kvm_xcrs { struct kvm_sync_regs { }; +#define KVM_QUIRK_LINT0_REENABLED (1 << 0) +#define KVM_QUIRK_CD_NW_CLEARED (1 << 1) + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 629af0f1c5c4..4071eb161c8f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1577,8 +1577,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic->lapic_timer.timer_mode = 0; - apic_set_reg(apic, APIC_LVT0, - SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); + if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED)) + apic_set_reg(apic, APIC_LVT0, + SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_set_reg(apic, APIC_DFR, 0xffffffffU); apic_set_spiv(apic, 0xff); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce741b8650f6..46299dac7c6d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1575,7 +1575,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * does not do it - this results in some delay at * reboot */ - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED)) + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7a959be0aebc..0435b653f583 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2800,6 +2800,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: + case KVM_CAP_ENABLE_CAP_VM: + case KVM_CAP_DISABLE_QUIRKS: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -3847,6 +3849,26 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, return 0; } +static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_DISABLE_QUIRKS: + kvm->arch.disabled_quirks = cap->args[0]; + r = 0; + break; + default: + r = -EINVAL; + break; + } + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4099,7 +4121,15 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vm_ioctl_enable_cap(kvm, &cap); + break; + } default: r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); } -- cgit v1.2.1 From d28bc9dd25ce023270d2e039e7c98d38ecbf7758 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 13 Apr 2015 14:34:08 +0300 Subject: KVM: x86: INIT and reset sequences are different x86 architecture defines differences between the reset and INIT sequences. INIT does not initialize the FPU (including MMX, XMM, YMM, etc.), TSC, PMU, MSRs (in general), MTRRs machine-check, APIC ID, APIC arbitration ID and BSP. References (from Intel SDM): "If the MP protocol has completed and a BSP is chosen, subsequent INITs (either to a specific processor or system wide) do not cause the MP protocol to be repeated." [8.4.2: MP Initialization Protocol Requirements and Restrictions] [Table 9-1. IA-32 Processor States Following Power-up, Reset, or INIT] "If the processor is reset by asserting the INIT# pin, the x87 FPU state is not changed." [9.2: X87 FPU INITIALIZATION] "The state of the local APIC following an INIT reset is the same as it is after a power-up or hardware reset, except that the APIC ID and arbitration ID registers are not affected." [10.4.7.3: Local APIC State After an INIT Reset ("Wait-for-SIPI" State)] Signed-off-by: Nadav Amit Message-Id: <1428924848-28212-1-git-send-email-namit@cs.technion.ac.il> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++--- arch/x86/kvm/lapic.c | 11 +++++---- arch/x86/kvm/lapic.h | 2 +- arch/x86/kvm/svm.c | 27 +++++++++++----------- arch/x86/kvm/vmx.c | 51 +++++++++++++++++++++++------------------ arch/x86/kvm/x86.c | 17 ++++++++------ 6 files changed, 63 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f80ad591aa61..3a19e30f0be0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -711,7 +711,7 @@ struct kvm_x86_ops { /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); - void (*vcpu_reset)(struct kvm_vcpu *vcpu); + void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); @@ -1001,7 +1001,7 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -int fx_init(struct kvm_vcpu *vcpu); +int fx_init(struct kvm_vcpu *vcpu, bool init_event); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); @@ -1145,7 +1145,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); -void kvm_vcpu_reset(struct kvm_vcpu *vcpu); +void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, unsigned long address); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4071eb161c8f..abf165330881 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1557,7 +1557,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } -void kvm_lapic_reset(struct kvm_vcpu *vcpu) +void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic; int i; @@ -1571,7 +1571,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - kvm_apic_set_id(apic, vcpu->vcpu_id); + if (!init_event) + kvm_apic_set_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); for (i = 0; i < APIC_LVT_NUM; i++) @@ -1713,7 +1714,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ - kvm_lapic_reset(vcpu); + kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); return 0; @@ -2047,8 +2048,8 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) pe = xchg(&apic->pending_events, 0); if (test_bit(KVM_APIC_INIT, &pe)) { - kvm_lapic_reset(vcpu); - kvm_vcpu_reset(vcpu); + kvm_lapic_reset(vcpu, true); + kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 9d28383fc1e7..793f76183e8b 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,7 +48,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); void kvm_apic_accept_events(struct kvm_vcpu *vcpu); -void kvm_lapic_reset(struct kvm_vcpu *vcpu); +void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 46299dac7c6d..b6ad0f9de80a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1082,7 +1082,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - tsc; } -static void init_vmcb(struct vcpu_svm *svm) +static void init_vmcb(struct vcpu_svm *svm, bool init_event) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; @@ -1153,17 +1153,17 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_efer(&svm->vcpu, 0); + if (!init_event) + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; /* - * This is the guest-visible cr0 value. * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. + * It also updates the guest-visible cr0 value. */ - svm->vcpu.arch.cr0 = 0; (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); save->cr4 = X86_CR4_PAE; @@ -1195,13 +1195,19 @@ static void init_vmcb(struct vcpu_svm *svm) enable_gif(svm); } -static void svm_vcpu_reset(struct kvm_vcpu *vcpu) +static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); u32 dummy; u32 eax = 1; - init_vmcb(svm); + if (!init_event) { + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) + svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + } + init_vmcb(svm, init_event); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); @@ -1257,12 +1263,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - init_vmcb(svm); - - svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) - svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + init_vmcb(svm, false); svm_init_osvw(&svm->vcpu); @@ -1884,7 +1885,7 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm); + init_vmcb(svm, false); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f7b61687bd79..a694ff6bce80 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4708,22 +4708,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } -static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct msr_data apic_base_msr; + u64 cr0; vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - kvm_set_cr8(&vmx->vcpu, 0); - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(&vmx->vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); + kvm_set_cr8(vcpu, 0); + + if (!init_event) { + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; + apic_base_msr.host_initiated = true; + kvm_set_apic_base(vcpu, &apic_base_msr); + } vmx_segment_cache_clear(vmx); @@ -4747,9 +4752,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); + if (!init_event) { + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + } vmcs_writel(GUEST_RFLAGS, 0x02); kvm_rip_write(vcpu, 0xfff0); @@ -4764,18 +4772,15 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - /* Special registers */ - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - setup_msrs(vmx); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - if (cpu_has_vmx_tpr_shadow()) { + if (cpu_has_vmx_tpr_shadow() && !init_event) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vmx->vcpu.kvm)) + if (vm_need_tpr_shadow(vcpu->kvm)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vmx->vcpu.arch.apic->regs)); + __pa(vcpu->arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); } @@ -4787,12 +4792,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ - vmx_set_cr4(&vmx->vcpu, 0); - vmx_set_efer(&vmx->vcpu, 0); - vmx_fpu_activate(&vmx->vcpu); - update_exception_bitmap(&vmx->vcpu); + cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + vmx_set_cr0(vcpu, cr0); /* enter rmode */ + vmx->vcpu.arch.cr0 = cr0; + vmx_set_cr4(vcpu, 0); + if (!init_event) + vmx_set_efer(vcpu, 0); + vmx_fpu_activate(vcpu); + update_exception_bitmap(vcpu); vpid_sync_context(vmx); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0435b653f583..755a79fabfc1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7033,7 +7033,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } -int fx_init(struct kvm_vcpu *vcpu) +int fx_init(struct kvm_vcpu *vcpu, bool init_event) { int err; @@ -7041,7 +7041,9 @@ int fx_init(struct kvm_vcpu *vcpu) if (err) return err; - fpu_finit(&vcpu->arch.guest_fpu); + if (!init_event) + fpu_finit(&vcpu->arch.guest_fpu); + if (cpu_has_xsaves) vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; @@ -7121,7 +7123,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) r = vcpu_load(vcpu); if (r) return r; - kvm_vcpu_reset(vcpu); + kvm_vcpu_reset(vcpu, false); kvm_mmu_setup(vcpu); vcpu_put(vcpu); @@ -7159,7 +7161,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); } -void kvm_vcpu_reset(struct kvm_vcpu *vcpu) +void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; @@ -7186,13 +7188,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - kvm_pmu_reset(vcpu); + if (!init_event) + kvm_pmu_reset(vcpu); memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; - kvm_x86_ops->vcpu_reset(vcpu); + kvm_x86_ops->vcpu_reset(vcpu, init_event); } void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) @@ -7381,7 +7384,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; } - r = fx_init(vcpu); + r = fx_init(vcpu, false); if (r) goto fail_free_wbinvd_dirty_mask; -- cgit v1.2.1 From b7cb22317305883d50f930cd6bf2fdb37df930c1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Apr 2015 14:57:05 +0200 Subject: KVM: x86: tweak types of fields in kvm_lapic_irq Change to u16 if they only contain data in the low 16 bits. Change the level field to bool, since we assign 1 sometimes, but just mask icr_low with APIC_INT_ASSERT in apic_send_ipi. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 ++++---- arch/x86/kvm/lapic.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3a19e30f0be0..dc83b43d0850 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -689,10 +689,10 @@ struct msr_data { struct kvm_lapic_irq { u32 vector; - u32 delivery_mode; - u32 dest_mode; - u32 level; - u32 trig_mode; + u16 delivery_mode; + u16 dest_mode; + bool level; + u16 trig_mode; u32 shorthand; u32 dest_id; }; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index abf165330881..ba585d0c42c5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -914,7 +914,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.vector = icr_low & APIC_VECTOR_MASK; irq.delivery_mode = icr_low & APIC_MODE_MASK; irq.dest_mode = icr_low & APIC_DEST_MASK; - irq.level = icr_low & APIC_INT_ASSERT; + irq.level = (icr_low & APIC_INT_ASSERT) != 0; irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; irq.shorthand = icr_low & APIC_SHORT_MASK; if (apic_x2apic_mode(apic)) -- cgit v1.2.1 From 93bbf0b8bc80f0ee3c629542a4dea14a3537760b Mon Sep 17 00:00:00 2001 From: James Sullivan Date: Wed, 18 Mar 2015 19:26:03 -0600 Subject: kvm: x86: Extended struct kvm_lapic_irq with msi_redir_hint for MSI delivery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extended struct kvm_lapic_irq with bool msi_redir_hint, which will be used to determine if the delivery of the MSI should target only the lowest priority CPU in the logical group specified for delivery. (In physical dest mode, the RH bit is not relevant). Initialized the value of msi_redir_hint to true when RH=1 in kvm_set_msi_irq(), and initialized to false in all other cases. Added value of msi_redir_hint to a debug message dump of an IRQ in apic_send_ipi(). Signed-off-by: James Sullivan Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/ioapic.c | 1 + arch/x86/kvm/irq_comm.c | 3 ++- arch/x86/kvm/lapic.c | 6 ++++-- arch/x86/kvm/x86.c | 1 + 5 files changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dc83b43d0850..8b661d1946b5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -695,6 +695,7 @@ struct kvm_lapic_irq { u16 trig_mode; u32 shorthand; u32 dest_id; + bool msi_redir_hint; }; struct kvm_x86_ops { diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 28146f03c514..274663496f7a 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -349,6 +349,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.delivery_mode = entry->fields.delivery_mode << 8; irqe.level = 1; irqe.shorthand = 0; + irqe.msi_redir_hint = false; if (irqe.trig_mode == IOAPIC_EDGE_TRIG) ioapic->irr_delivered |= 1 << irq; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 72298b3ac025..80c10af2bf46 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -106,9 +106,10 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; irq->delivery_mode = e->msi.data & 0x700; + irq->msi_redir_hint = ((e->msi.address_lo + & MSI_ADDR_REDIRECTION_LOWPRI) > 0); irq->level = 1; irq->shorthand = 0; - /* TODO Deal with RH bit of MSI message address */ } int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba585d0c42c5..2573b29bbbb3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -917,6 +917,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.level = (icr_low & APIC_INT_ASSERT) != 0; irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; irq.shorthand = icr_low & APIC_SHORT_MASK; + irq.msi_redir_hint = false; if (apic_x2apic_mode(apic)) irq.dest_id = icr_high; else @@ -926,10 +927,11 @@ static void apic_send_ipi(struct kvm_lapic *apic) apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " - "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", + "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, " + "msi_redir_hint 0x%x\n", icr_high, icr_low, irq.shorthand, irq.dest_id, irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, - irq.vector); + irq.vector, irq.msi_redir_hint); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 755a79fabfc1..bf80ce7656b7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5984,6 +5984,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) lapic_irq.shorthand = 0; lapic_irq.dest_mode = 0; lapic_irq.dest_id = apicid; + lapic_irq.msi_redir_hint = false; lapic_irq.delivery_mode = APIC_DM_REMRD; kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); -- cgit v1.2.1 From d1ebdbf99a3ce90f3b886c2cf0dfd7da17703d2a Mon Sep 17 00:00:00 2001 From: James Sullivan Date: Wed, 18 Mar 2015 19:26:04 -0600 Subject: kvm: x86: Deliver MSI IRQ to only lowest prio cpu if msi_redir_hint is true MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An MSI interrupt should only be delivered to the lowest priority CPU when it has RH=1, regardless of the delivery mode. Modified kvm_is_dm_lowest_prio() to check for either irq->delivery_mode == APIC_DM_LOWPRI or irq->msi_redir_hint. Moved kvm_is_dm_lowest_prio() into lapic.h and renamed to kvm_lowest_prio_delivery(). Changed a check in kvm_irq_delivery_to_apic_fast() from irq->delivery_mode == APIC_DM_LOWPRI to kvm_is_dm_lowest_prio(). Signed-off-by: James Sullivan Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 11 ++++------- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/lapic.h | 6 ++++++ 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 80c10af2bf46..9efff9e5b58c 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -31,6 +31,8 @@ #include "ioapic.h" +#include "lapic.h" + static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -48,11 +50,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, line_status); } -inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) -{ - return irq->delivery_mode == APIC_DM_LOWEST; -} - int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map) { @@ -60,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_vcpu *vcpu, *lowest = NULL; if (irq->dest_mode == 0 && irq->dest_id == 0xff && - kvm_is_dm_lowest_prio(irq)) { + kvm_lowest_prio_delivery(irq)) { printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } @@ -76,7 +73,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, irq->dest_id, irq->dest_mode)) continue; - if (!kvm_is_dm_lowest_prio(irq)) { + if (!kvm_lowest_prio_delivery(irq)) { if (r < 0) r = 0; r += kvm_apic_set_irq(vcpu, irq, dest_map); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2573b29bbbb3..dc5b57bb1f76 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -728,7 +728,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, dst = map->logical_map[cid]; - if (irq->delivery_mode == APIC_DM_LOWEST) { + if (kvm_lowest_prio_delivery(irq)) { int l = -1; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 793f76183e8b..71b150cae5f9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -153,6 +153,12 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) return vcpu->arch.apic->pending_events; } +static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) +{ + return (irq->delivery_mode == APIC_DM_LOWEST || + irq->msi_redir_hint); +} + bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); -- cgit v1.2.1 From 653f52c316a49c5ee2701bc13b15879f20790662 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 23 Apr 2015 11:52:37 -0400 Subject: kvm,x86: load guest FPU context more eagerly Currently KVM will clear the FPU bits in CR0.TS in the VMCS, and trap to re-load them every time the guest accesses the FPU after a switch back into the guest from the host. This patch copies the x86 task switch semantics for FPU loading, with the FPU loaded eagerly after first use if the system uses eager fpu mode, or if the guest uses the FPU frequently. In the latter case, after loading the FPU for 255 times, the fpu_counter will roll over, and we will revert to loading the FPU on demand, until it has been established that the guest is still actively using the FPU. This mirrors the x86 task switch policy, which seems to work. Signed-off-by: Rik van Riel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf80ce7656b7..c42134e98e31 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7086,14 +7086,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { kvm_put_guest_xcr0(vcpu); - if (!vcpu->guest_fpu_loaded) + if (!vcpu->guest_fpu_loaded) { + vcpu->fpu_counter = 0; return; + } vcpu->guest_fpu_loaded = 0; fpu_save_init(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + /* + * If using eager FPU mode, or if the guest is a frequent user + * of the FPU, just leave the FPU active for next time. + * Every 255 times fpu_counter rolls over to 0; a guest that uses + * the FPU in bursts will revert to loading it on demand. + */ + if (!use_eager_fpu()) { + if (++vcpu->fpu_counter < 5) + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + } trace_kvm_fpu(0); } -- cgit v1.2.1 From 74545705cb2e398efc3cd751628c58f021323434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Mon, 27 Apr 2015 15:11:25 +0200 Subject: KVM: x86: fix initial PAT value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PAT should be 0007_0406_0007_0406h on RESET and not modified on INIT. VMX used a wrong value (host's PAT) and while SVM used the right one, it never got to arch.pat. This is not an issue with QEMU as it will force the correct value. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 12 ++---------- arch/x86/kvm/x86.c | 2 ++ arch/x86/kvm/x86.h | 2 ++ 4 files changed, 7 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b6ad0f9de80a..8954d7a07703 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1176,7 +1176,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); - save->g_pat = 0x0007040600070406ULL; + save->g_pat = svm->vcpu.arch.pat; save->cr3 = 0; save->cr4 = 0; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a694ff6bce80..3c218fd59f8c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4667,16 +4667,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - u32 msr_low, msr_high; - u64 host_pat; - rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); - host_pat = msr_low | ((u64) msr_high << 32); - /* Write the default value follow host pat */ - vmcs_write64(GUEST_IA32_PAT, host_pat); - /* Keep arch.pat sync with GUEST_IA32_PAT */ - vmx->vcpu.arch.pat = host_pat; - } + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { u32 index = vmx_msr_index[i]; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c42134e98e31..cdccbe1749a5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7408,6 +7408,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f5fef1868096..01a1d011e073 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -4,6 +4,8 @@ #include #include "kvm_cache_regs.h" +#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL + static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { vcpu->arch.exception.pending = false; -- cgit v1.2.1 From d90e3a35e90af7f8beceebefc90e62eae237c5ed Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 27 Apr 2015 22:35:34 +0200 Subject: KVM: x86: drop unneeded null test If the null test is needed, the call to cancel_delayed_work_sync would have already crashed. Normally, the destroy function should only be called if the init function has succeeded, in which case ioapic is not null. Problem found using Coccinelle. Suggested-by: Michael S. Tsirkin Signed-off-by: Julia Lawall Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 274663496f7a..856f79105bb5 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -638,11 +638,9 @@ void kvm_ioapic_destroy(struct kvm *kvm) struct kvm_ioapic *ioapic = kvm->arch.vioapic; cancel_delayed_work_sync(&ioapic->eoi_inject); - if (ioapic) { - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); - kvm->arch.vioapic = NULL; - kfree(ioapic); - } + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm->arch.vioapic = NULL; + kfree(ioapic); } int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) -- cgit v1.2.1 From a3eb97bd80134ba07864ca00747466c02118aca1 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 24 Apr 2015 22:36:14 -0300 Subject: x86: kvmclock: drop rdtsc_barrier() Drop unnecessary rdtsc_barrier(), as has been determined empirically, see 057e6a8c660e95c3f4e7162e00e2fee1fc90c50d for details. Noticed by Andy Lutomirski. Improves clock_gettime() by approximately 15% on Intel i7-3520M @ 2.90GHz. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/pvclock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index d6b078e9fa28..628954ceede1 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -86,7 +86,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, offset = pvclock_get_nsec_offset(src); ret = src->system_time + offset; ret_flags = src->flags; - rdtsc_barrier(); *cycles = ret; *flags = ret_flags; -- cgit v1.2.1 From 4eb64dce8d0a3ceccc0036383e4683b7c3ea7a50 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 Apr 2015 12:57:28 +0200 Subject: KVM: x86: dump VMCS on invalid entry Code and format roughly based on Xen's vmcs_dump_vcpu. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3c218fd59f8c..63849a0334a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7690,6 +7690,158 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) kvm_vcpu_kick(vcpu); } +static void vmx_dump_sel(char *name, uint32_t sel) +{ + pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(sel), + vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), + vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), + vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); +} + +static void vmx_dump_dtsel(char *name, uint32_t limit) +{ + pr_err("%s limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(limit), + vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); +} + +static void dump_vmcs(void) +{ + u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); + u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); + u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + u32 secondary_exec_control = 0; + unsigned long cr4 = vmcs_readl(GUEST_CR4); + u64 efer = vmcs_readl(GUEST_IA32_EFER); + int i, n; + + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + pr_err("*** Guest State ***\n"); + pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), + vmcs_readl(CR0_GUEST_HOST_MASK)); + pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); + pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && + (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) + { + pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n", + vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1)); + pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n", + vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3)); + } + pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", + vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); + pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", + vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(GUEST_SYSENTER_ESP), + vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); + vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); + vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); + vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); + vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); + vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); + vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); + vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); + vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); + vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); + vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); + if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || + (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) + pr_err("EFER = 0x%016llx PAT = 0x%016lx\n", + efer, vmcs_readl(GUEST_IA32_PAT)); + pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n", + vmcs_readl(GUEST_IA32_DEBUGCTL), + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016lx\n", + vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL)); + if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) + pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS)); + pr_err("Interruptibility = %08x ActivityState = %08x\n", + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), + vmcs_read32(GUEST_ACTIVITY_STATE)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + pr_err("InterruptStatus = %04x\n", + vmcs_read16(GUEST_INTR_STATUS)); + + pr_err("*** Host State ***\n"); + pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", + vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); + pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", + vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), + vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), + vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), + vmcs_read16(HOST_TR_SELECTOR)); + pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", + vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), + vmcs_readl(HOST_TR_BASE)); + pr_err("GDTBase=%016lx IDTBase=%016lx\n", + vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); + pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", + vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), + vmcs_readl(HOST_CR4)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(HOST_IA32_SYSENTER_ESP), + vmcs_read32(HOST_IA32_SYSENTER_CS), + vmcs_readl(HOST_IA32_SYSENTER_EIP)); + if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) + pr_err("EFER = 0x%016lx PAT = 0x%016lx\n", + vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016lx\n", + vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL)); + + pr_err("*** Control State ***\n"); + pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", + pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); + pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", + vmcs_read32(EXCEPTION_BITMAP), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); + pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), + vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); + pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); + pr_err(" reason=%08x qualification=%016lx\n", + vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); + pr_err("IDTVectoring: info=%08x errcode=%08x\n", + vmcs_read32(IDT_VECTORING_INFO_FIELD), + vmcs_read32(IDT_VECTORING_ERROR_CODE)); + pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) + pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) + pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) + pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER)); + n = vmcs_read32(CR3_TARGET_COUNT); + for (i = 0; i + 1 < n; i += 4) + pr_err("CR3 target%u=%016lx target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), + i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); + if (i < n) + pr_err("CR3 target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); + if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) + pr_err("PLE Gap=%08x Window=%08x\n", + vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); + if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) + pr_err("Virtual processor ID = 0x%04x\n", + vmcs_read16(VIRTUAL_PROCESSOR_ID)); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7722,6 +7874,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; -- cgit v1.2.1 From 8cd161b1f755decd8b7f6c9c7144119281fe11a4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 May 2015 11:38:08 +0200 Subject: x86/traps: Remove superfluous weak definitions and dead code Those were leftovers of the x86 merge, see 081f75bbdc86 ("traps: x86: make traps_32.c and traps_64.c equal") for example and are not needed now. Signed-off-by: Borislav Petkov --- arch/x86/kernel/traps.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 68b1d5979a46..2768bb663f94 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -813,23 +813,6 @@ dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { conditional_sti(regs); -#if 0 - /* No need to warn about this any longer. */ - pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) -{ -} - -asmlinkage __visible void __attribute__((weak)) -smp_deferred_error_interrupt(void) -{ } /* -- cgit v1.2.1 From 3490c0e45f7e5a5b1e5c62e4c60b0e55b2e75e71 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 May 2015 12:06:43 +0200 Subject: x86/mce/amd: Zap changelog It is useless and git history has it all detailed anyway. Update copyright while at it. Signed-off-by: Borislav Petkov Cc: Aravind Gopalakrishnan --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 70e1bf6f784d..e99b15077e94 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,21 +1,13 @@ /* - * (c) 2005-2012 Advanced Micro Devices, Inc. + * (c) 2005-2015 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. - * * Maintained by: Borislav Petkov * - * April 2006 - * - added support for AMD Family 0x10 processors - * May 2012 - * - major scrubbing - * May 2015 - * - add support for deferred error interrupts (Aravind Gopalakrishnan) - * - * All MC4_MISCi registers are shared between multi-cores + * All MC4_MISCi registers are shared between cores on a node. */ #include #include -- cgit v1.2.1 From acac6f89574c743796c9c947e046a0d6637d5fe2 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 3 May 2015 20:22:57 +0300 Subject: KVM: x86: Call-far should not be emulated as stack op Far call in 64-bit has a 32-bit operand size. Remove the marking of this operation as Stack so it can be emulated correctly in 64-bit. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 630bcb0d7a04..5839fc56cb3e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3840,7 +3840,7 @@ static const struct opcode group5[] = { F(DstMem | SrcNone | Lock, em_inc), F(DstMem | SrcNone | Lock, em_dec), I(SrcMem | NearBranch, em_call_near_abs), - I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), + I(SrcMemFAddr | ImplicitOps, em_call_far), I(SrcMem | NearBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps, em_jmp_far), I(SrcMem | Stack, em_push), D(Undefined), -- cgit v1.2.1 From 8a9781f7ad0087182c51c629335803264568ef3b Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 4 May 2015 08:32:32 +0200 Subject: KVM: nVMX: Fix host crash when loading MSRs with userspace irqchip vcpu->arch.apic is NULL when a userspace irqchip is active. But instead of letting the test incorrectly depend on in-kernel irqchip mode, open-code it to catch also userspace x2APICs. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 63849a0334a6..5e8d41bccc26 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2170,8 +2170,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_nested; - else if (irqchip_in_kernel(vcpu->kvm) && - apic_x2apic_mode(vcpu->arch.apic)) { + else if (vcpu->arch.apic_base & X2APIC_ENABLE) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else @@ -9076,7 +9075,7 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { /* x2APIC MSR accesses are not allowed */ - if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) + if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) return -EINVAL; if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ e->index == MSR_IA32_UCODE_REV) -- cgit v1.2.1 From ceee7df749da03fa17ab5dc7d06dffb4980a32db Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 7 May 2015 16:20:15 +0800 Subject: KVM: MMU: fix smap permission check Current permission check assumes that RSVD bit in PFEC is always zero, however, it is not true since MMIO #PF will use it to quickly identify MMIO access Fix it by clearing the bit if walking guest page table is needed Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 2 ++ arch/x86/kvm/paging_tmpl.h | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c7d65637c851..06eb2fc1bab8 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -166,6 +166,8 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + WARN_ON(pfec & PFERR_RSVD_MASK); + return (mmu->permissions[index] >> pte_access) & 1; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index fd49c867b25a..6e6d115fe9b5 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -718,6 +718,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, mmu_is_nested(vcpu)); if (likely(r != RET_MMIO_PF_INVALID)) return r; + + /* + * page fault with PFEC.RSVD = 1 is caused by shadow + * page fault, should not be used to walk guest page + * table. + */ + error_code &= ~PFERR_RSVD_MASK; }; r = mmu_topup_memory_caches(vcpu); -- cgit v1.2.1 From 31fd9880a1c52de8880ea49dca7848caacb4b3a3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 2 Apr 2015 11:04:05 +0200 Subject: KVM: MMU: fix CR4.SMEP=1, CR0.WP=0 with shadow pages smep_andnot_wp is initialized in kvm_init_shadow_mmu and shadow pages should not be reused for different values of it. Thus, it has to be added to the mask in kvm_mmu_pte_write. Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d43867c33bc4..209fe1477465 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4238,7 +4238,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); - mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; + mask.cr0_wp = mask.cr4_pae = mask.nxe = mask.smep_andnot_wp = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { -- cgit v1.2.1 From dde74f2e4a4447ef838c57e407f7139de3df68cb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 27 Apr 2015 15:21:51 +0200 Subject: x86/asm/entry/64: Tidy up JZ insns after TESTs After TESTs, use logically correct JZ/JNZ mnemonics instead of JE/JNE. This doesn't change code. Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1430140912-7960-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e952f6bf1d6d..8f8b22a361df 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -666,7 +666,7 @@ END(irq_entries_start) leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ testl $3, CS-RBP(%rsp) - je 1f + jz 1f SWAPGS 1: /* @@ -721,7 +721,7 @@ ret_from_intr: CFI_ADJUST_CFA_OFFSET RBP testl $3,CS(%rsp) - je retint_kernel + jz retint_kernel /* Interrupt came from user space */ GET_THREAD_INFO(%rcx) @@ -1310,7 +1310,7 @@ ENTRY(error_entry) SAVE_EXTRA_REGS 8 xorl %ebx,%ebx testl $3,CS+8(%rsp) - je error_kernelspace + jz error_kernelspace error_swapgs: SWAPGS error_sti: @@ -1361,7 +1361,7 @@ ENTRY(error_exit) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax - jne retint_kernel + jnz retint_kernel LOCKDEP_SYS_EXIT_IRQ movl TI_flags(%rcx),%edx movl $_TIF_WORK_MASK,%edi -- cgit v1.2.1 From 03335e95e27fc1f2b17b05b27342ad76986b3cf0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 27 Apr 2015 15:21:52 +0200 Subject: x86/asm/entry/64: Clean up usage of TEST insns By the nature of TEST operation, it is often possible to test a narrower part of the operand: "testl $3, mem" -> "testb $3, mem" This results in shorter insns, because TEST insn has no sign-entending byte-immediate forms unlike other ALU ops. text data bss dec hex filename 11674 0 0 11674 2d9a entry_64.o.before 11658 0 0 11658 2d8a entry_64.o Changes in object code: - f7 84 24 88 00 00 00 03 00 00 00 testl $0x3,0x88(%rsp) + f6 84 24 88 00 00 00 03 testb $0x3,0x88(%rsp) - f7 44 24 68 03 00 00 00 testl $0x3,0x68(%rsp) + f6 44 24 68 03 testb $0x3,0x68(%rsp) - f7 84 24 90 00 00 00 03 00 00 00 testl $0x3,0x90(%rsp) + f6 84 24 90 00 00 00 03 testb $0x3,0x90(%rsp) Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1430140912-7960-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8f8b22a361df..60705b032521 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -601,7 +601,7 @@ ENTRY(ret_from_fork) RESTORE_EXTRA_REGS - testl $3,CS(%rsp) # from kernel_thread? + testb $3, CS(%rsp) # from kernel_thread? /* * By the time we get here, we have no idea whether our pt_regs, @@ -665,7 +665,7 @@ END(irq_entries_start) leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ - testl $3, CS-RBP(%rsp) + testb $3, CS-RBP(%rsp) jz 1f SWAPGS 1: @@ -720,7 +720,7 @@ ret_from_intr: CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET RBP - testl $3,CS(%rsp) + testb $3, CS(%rsp) jz retint_kernel /* Interrupt came from user space */ @@ -968,7 +968,7 @@ ENTRY(\sym) .if \paranoid .if \paranoid == 1 CFI_REMEMBER_STATE - testl $3, CS(%rsp) /* If coming from userspace, switch */ + testb $3, CS(%rsp) /* If coming from userspace, switch */ jnz 1f /* stacks. */ .endif call paranoid_entry @@ -1309,7 +1309,7 @@ ENTRY(error_entry) SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 xorl %ebx,%ebx - testl $3,CS+8(%rsp) + testb $3, CS+8(%rsp) jz error_kernelspace error_swapgs: SWAPGS @@ -1606,7 +1606,6 @@ end_repeat_nmi: je 1f movq %r12, %cr2 1: - testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: -- cgit v1.2.1 From d73a33973f16ab6703e75ea00edee857afa3406e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 24 Apr 2015 14:56:31 -0400 Subject: locking/qspinlock, x86: Enable x86-64 to use queued spinlocks This patch makes the necessary changes at the x86 architecture specific layer to enable the use of queued spinlocks for x86-64. As x86-32 machines are typically not multi-socket. The benefit of queue spinlock may not be apparent. So queued spinlocks are not enabled. Currently, there is some incompatibilities between the para-virtualized spinlock code (which hard-codes the use of ticket spinlock) and the queued spinlocks. Therefore, the use of queued spinlocks is disabled when the para-virtualized spinlock is enabled. The arch/x86/include/asm/qspinlock.h header file includes some x86 specific optimization which will make the queueds spinlock code perform better than the generic implementation. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Daniel J Blueman Cc: David Vrabel Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Rik van Riel Cc: Scott J Norton Cc: Thomas Gleixner Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1429901803-29771-3-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/qspinlock.h | 20 ++++++++++++++++++++ arch/x86/include/asm/spinlock.h | 5 +++++ arch/x86/include/asm/spinlock_types.h | 4 ++++ 4 files changed, 30 insertions(+) create mode 100644 arch/x86/include/asm/qspinlock.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 226d5696e1d1..90b1b54f4f38 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -127,6 +127,7 @@ config X86 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_QUEUED_SPINLOCK select ARCH_USE_QUEUE_RWLOCK select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select OLD_SIGACTION if X86_32 diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h new file mode 100644 index 000000000000..e2aee8273664 --- /dev/null +++ b/arch/x86/include/asm/qspinlock.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_QSPINLOCK_H +#define _ASM_X86_QSPINLOCK_H + +#include + +#define queued_spin_unlock queued_spin_unlock +/** + * queued_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + * + * A smp_store_release() on the least-significant byte. + */ +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + smp_store_release((u8 *)lock, 0); +} + +#include + +#endif /* _ASM_X86_QSPINLOCK_H */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 64b611782ef0..4ec5413156ca 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -42,6 +42,10 @@ extern struct static_key paravirt_ticketlocks_enabled; static __always_inline bool static_key_false(struct static_key *key); +#ifdef CONFIG_QUEUED_SPINLOCK +#include +#else + #ifdef CONFIG_PARAVIRT_SPINLOCKS static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) @@ -196,6 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) cpu_relax(); } } +#endif /* CONFIG_QUEUED_SPINLOCK */ /* * Read-write spinlocks, allowing multiple readers diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 5f9d7572d82b..5df1f1b9a4b0 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -23,6 +23,9 @@ typedef u32 __ticketpair_t; #define TICKET_SHIFT (sizeof(__ticket_t) * 8) +#ifdef CONFIG_QUEUED_SPINLOCK +#include +#else typedef struct arch_spinlock { union { __ticketpair_t head_tail; @@ -33,6 +36,7 @@ typedef struct arch_spinlock { } arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } +#endif /* CONFIG_QUEUED_SPINLOCK */ #include -- cgit v1.2.1 From 2aa79af64263190eec610422b07f60e99a7d230a Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Fri, 24 Apr 2015 14:56:36 -0400 Subject: locking/qspinlock: Revert to test-and-set on hypervisors When we detect a hypervisor (!paravirt, see qspinlock paravirt support patches), revert to a simple test-and-set lock to avoid the horrors of queue preemption. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Daniel J Blueman Cc: David Vrabel Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Rik van Riel Cc: Scott J Norton Cc: Thomas Gleixner Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1429901803-29771-8-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/qspinlock.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index e2aee8273664..f079b7020e3f 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_QSPINLOCK_H #define _ASM_X86_QSPINLOCK_H +#include #include #define queued_spin_unlock queued_spin_unlock @@ -15,6 +16,19 @@ static inline void queued_spin_unlock(struct qspinlock *lock) smp_store_release((u8 *)lock, 0); } +#define virt_queued_spin_lock virt_queued_spin_lock + +static inline bool virt_queued_spin_lock(struct qspinlock *lock) +{ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0) + cpu_relax(); + + return true; +} + #include #endif /* _ASM_X86_QSPINLOCK_H */ -- cgit v1.2.1 From f233f7f1581e78fd9b4023f2e7d8c1ed89020cc9 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Fri, 24 Apr 2015 14:56:38 -0400 Subject: locking/pvqspinlock, x86: Implement the paravirt qspinlock call patching We use the regular paravirt call patching to switch between: native_queued_spin_lock_slowpath() __pv_queued_spin_lock_slowpath() native_queued_spin_unlock() __pv_queued_spin_unlock() We use a callee saved call for the unlock function which reduces the i-cache footprint and allows 'inlining' of SPIN_UNLOCK functions again. We further optimize the unlock path by patching the direct call with a "movb $0,%arg1" if we are indeed using the native unlock code. This makes the unlock code almost as fast as the !PARAVIRT case. This significantly lowers the overhead of having CONFIG_PARAVIRT_SPINLOCKS enabled, even for native code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Daniel J Blueman Cc: David Vrabel Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Rik van Riel Cc: Scott J Norton Cc: Thomas Gleixner Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1429901803-29771-10-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/paravirt.h | 29 ++++++++++++++++++++++++++++- arch/x86/include/asm/paravirt_types.h | 10 ++++++++++ arch/x86/include/asm/qspinlock.h | 25 ++++++++++++++++++++++++- arch/x86/include/asm/qspinlock_paravirt.h | 6 ++++++ arch/x86/kernel/paravirt-spinlocks.c | 24 +++++++++++++++++++++++- arch/x86/kernel/paravirt_patch_32.c | 22 ++++++++++++++++++---- arch/x86/kernel/paravirt_patch_64.c | 22 ++++++++++++++++++---- 8 files changed, 128 insertions(+), 12 deletions(-) create mode 100644 arch/x86/include/asm/qspinlock_paravirt.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 90b1b54f4f38..50ec043a920d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -667,7 +667,7 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP - select UNINLINE_SPIN_UNLOCK + select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCK ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 8957810ad7d1..266c35381b62 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,6 +712,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#ifdef CONFIG_QUEUED_SPINLOCK + +static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, + u32 val) +{ + PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val); +} + +static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) +{ + PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock); +} + +static __always_inline void pv_wait(u8 *ptr, u8 val) +{ + PVOP_VCALL2(pv_lock_ops.wait, ptr, val); +} + +static __always_inline void pv_kick(int cpu) +{ + PVOP_VCALL1(pv_lock_ops.kick, cpu); +} + +#else /* !CONFIG_QUEUED_SPINLOCK */ + static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) { @@ -724,7 +749,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } -#endif +#endif /* CONFIG_QUEUED_SPINLOCK */ + +#endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index f7b0b5c112f2..76cd68426af8 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -333,9 +333,19 @@ struct arch_spinlock; typedef u16 __ticket_t; #endif +struct qspinlock; + struct pv_lock_ops { +#ifdef CONFIG_QUEUED_SPINLOCK + void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); + struct paravirt_callee_save queued_spin_unlock; + + void (*wait)(u8 *ptr, u8 val); + void (*kick)(int cpu); +#else /* !CONFIG_QUEUED_SPINLOCK */ struct paravirt_callee_save lock_spinning; void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); +#endif /* !CONFIG_QUEUED_SPINLOCK */ }; /* This contains all the paravirt structures: we get a convenient diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index f079b7020e3f..9d51fae1cba3 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -3,6 +3,7 @@ #include #include +#include #define queued_spin_unlock queued_spin_unlock /** @@ -11,11 +12,33 @@ * * A smp_store_release() on the least-significant byte. */ -static inline void queued_spin_unlock(struct qspinlock *lock) +static inline void native_queued_spin_unlock(struct qspinlock *lock) { smp_store_release((u8 *)lock, 0); } +#ifdef CONFIG_PARAVIRT_SPINLOCKS +extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_init_lock_hash(void); +extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); + +static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + pv_queued_spin_lock_slowpath(lock, val); +} + +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + pv_queued_spin_unlock(lock); +} +#else +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} +#endif + #define virt_queued_spin_lock virt_queued_spin_lock static inline bool virt_queued_spin_lock(struct qspinlock *lock) diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h new file mode 100644 index 000000000000..b002e711ba88 --- /dev/null +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -0,0 +1,6 @@ +#ifndef __ASM_QSPINLOCK_PARAVIRT_H +#define __ASM_QSPINLOCK_PARAVIRT_H + +PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); + +#endif diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index bbb6c7316341..a33f1eb15003 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -8,11 +8,33 @@ #include +#ifdef CONFIG_QUEUED_SPINLOCK +__visible void __native_queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} + +PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); + +bool pv_is_native_spin_unlock(void) +{ + return pv_lock_ops.queued_spin_unlock.func == + __raw_callee_save___native_queued_spin_unlock; +} +#endif + struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP +#ifdef CONFIG_QUEUED_SPINLOCK + .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, + .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), + .wait = paravirt_nop, + .kick = paravirt_nop, +#else /* !CONFIG_QUEUED_SPINLOCK */ .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), .unlock_kick = paravirt_nop, -#endif +#endif /* !CONFIG_QUEUED_SPINLOCK */ +#endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index d9f32e6d6ab6..e1b013696dde 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -12,6 +12,10 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { /* arg in %eax, return in %eax */ @@ -24,6 +28,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) return 0; } +extern bool pv_is_native_spin_unlock(void); + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { @@ -47,14 +53,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_cpu_ops, read_tsc); - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): + if (pv_is_native_spin_unlock()) { + start = start_pv_lock_ops_queued_spin_unlock; + end = end_pv_lock_ops_queued_spin_unlock; + goto patch_site; + } +#endif default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; + +patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; } #undef PATCH_SITE return ret; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index a1da6737ba5b..e0fb41c8255b 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -21,6 +21,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCK) +DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -33,6 +37,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) start__mov64, end__mov64); } +extern bool pv_is_native_spin_unlock(void); + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { @@ -59,14 +65,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCK) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): + if (pv_is_native_spin_unlock()) { + start = start_pv_lock_ops_queued_spin_unlock; + end = end_pv_lock_ops_queued_spin_unlock; + goto patch_site; + } +#endif default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; + +patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; } #undef PATCH_SITE return ret; -- cgit v1.2.1 From bf0c7c34adc286bec3a5a38c00c773ba1b2d0396 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 24 Apr 2015 14:56:39 -0400 Subject: locking/pvqspinlock, x86: Enable PV qspinlock for KVM This patch adds the necessary KVM specific code to allow KVM to support the CPU halting and kicking operations needed by the queue spinlock PV code. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Daniel J Blueman Cc: David Vrabel Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Rik van Riel Cc: Scott J Norton Cc: Thomas Gleixner Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1429901803-29771-11-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kvm.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9435620062df..6c21d931bd24 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -584,6 +584,39 @@ static void kvm_kick_cpu(int cpu) kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); } + +#ifdef CONFIG_QUEUED_SPINLOCK + +#include + +static void kvm_wait(u8 *ptr, u8 val) +{ + unsigned long flags; + + if (in_nmi()) + return; + + local_irq_save(flags); + + if (READ_ONCE(*ptr) != val) + goto out; + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + local_irq_restore(flags); +} + +#else /* !CONFIG_QUEUED_SPINLOCK */ + enum kvm_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -817,6 +850,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) } } +#endif /* !CONFIG_QUEUED_SPINLOCK */ + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -828,8 +863,16 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; +#ifdef CONFIG_QUEUED_SPINLOCK + __pv_init_lock_hash(); + pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_lock_ops.wait = kvm_wait; + pv_lock_ops.kick = kvm_kick_cpu; +#else /* !CONFIG_QUEUED_SPINLOCK */ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); pv_lock_ops.unlock_kick = kvm_unlock_kick; +#endif } static __init int kvm_spinlock_init_jump(void) -- cgit v1.2.1 From e95e6f176c61dd0e7bd9fdfb4956df1f9bfe99d4 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 24 Apr 2015 14:56:40 -0400 Subject: locking/pvqspinlock, x86: Enable PV qspinlock for Xen This patch adds the necessary Xen specific code to allow Xen to support the CPU halting and kicking operations needed by the queue spinlock PV code. Signed-off-by: David Vrabel Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Daniel J Blueman Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Rik van Riel Cc: Scott J Norton Cc: Thomas Gleixner Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1429901803-29771-12-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 64 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 956374c1edbc..af907a90fb19 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -17,6 +17,56 @@ #include "xen-ops.h" #include "debugfs.h" +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(char *, irq_name); +static bool xen_pvspin = true; + +#ifdef CONFIG_QUEUED_SPINLOCK + +#include + +static void xen_qlock_kick(int cpu) +{ + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); +} + +/* + * Halt the current CPU & release it back to the host + */ +static void xen_qlock_wait(u8 *byte, u8 val) +{ + int irq = __this_cpu_read(lock_kicker_irq); + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return; + + /* clear pending */ + xen_clear_irq_pending(irq); + barrier(); + + /* + * We check the byte value after clearing pending IRQ to make sure + * that we won't miss a wakeup event because of the clearing. + * + * The sync_clear_bit() call in xen_clear_irq_pending() is atomic. + * So it is effectively a memory barrier for x86. + */ + if (READ_ONCE(*byte) != val) + return; + + /* + * If an interrupt happens here, it will leave the wakeup irq + * pending, which will cause xen_poll_irq() to return + * immediately. + */ + + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); +} + +#else /* CONFIG_QUEUED_SPINLOCK */ + enum xen_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -100,12 +150,9 @@ struct xen_lock_waiting { __ticket_t want; }; -static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); static cpumask_t waiting_cpus; -static bool xen_pvspin = true; __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { int irq = __this_cpu_read(lock_kicker_irq); @@ -217,6 +264,7 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) } } } +#endif /* CONFIG_QUEUED_SPINLOCK */ static irqreturn_t dummy_handler(int irq, void *dev_id) { @@ -280,8 +328,16 @@ void __init xen_init_spinlocks(void) return; } printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); +#ifdef CONFIG_QUEUED_SPINLOCK + __pv_init_lock_hash(); + pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_lock_ops.wait = xen_qlock_wait; + pv_lock_ops.kick = xen_qlock_kick; +#else pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; +#endif } /* @@ -310,7 +366,7 @@ static __init int xen_parse_nopvspin(char *arg) } early_param("xen_nopvspin", xen_parse_nopvspin); -#ifdef CONFIG_XEN_DEBUG_FS +#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCK) static struct dentry *d_spin_debug; -- cgit v1.2.1 From 562bfca4c80175d1d18eef5c3f4bb8dda53c03e4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 May 2015 12:43:53 +0200 Subject: x86/mm: Clean up types in xlate_dev_mem_ptr() some more So Linus noticed that in: 94d4b4765b7d ("x86/mm: Clean up types in xlate_dev_mem_ptr()") ... I added two nonsensical casts, due to the poor type choice for 'vaddr'. Change it to 'void *' and take advantage of void * arithmetics. This removes the casts. ( Also remove a nonsensical return line from unxlate_dev_mem_ptr() while at it. ) Suggested-by: Linus Torvalds Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 70e7444c6835..27ff21216dfa 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -353,18 +353,18 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) { unsigned long start = phys & PAGE_MASK; unsigned long offset = phys & ~PAGE_MASK; - unsigned long vaddr; + void *vaddr; /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ if (page_is_ram(start >> PAGE_SHIFT)) return __va(phys); - vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE); + vaddr = ioremap_cache(start, PAGE_SIZE); /* Only add the offset on success and return NULL if the ioremap() failed: */ if (vaddr) vaddr += offset; - return (void *)vaddr; + return vaddr; } void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) @@ -373,7 +373,6 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) return; iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); - return; } static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; -- cgit v1.2.1 From 2a4e90b18c256d52a7f3f77d58114f6d4e4a7f9f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 8 May 2015 12:26:02 +0200 Subject: x86: Force inlining of atomic ops With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline very small functions we expect to be inlined: $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn 473 000000000000000b t spin_unlock_irqrestore 449 000000000000005f t rcu_read_unlock 355 0000000000000009 t atomic_inc <== THIS 353 000000000000006e t rcu_read_lock 350 0000000000000075 t rcu_read_lock_sched_held 291 000000000000000b t spin_unlock 266 0000000000000019 t arch_local_irq_restore 215 000000000000000b t spin_lock 180 0000000000000011 t kzalloc 165 0000000000000012 t list_add_tail 161 0000000000000019 t arch_local_save_flags 153 0000000000000016 t test_and_set_bit 134 000000000000000b t spin_unlock_irq 134 0000000000000009 t atomic_dec <== THIS 130 000000000000000b t spin_unlock_bh 122 0000000000000010 t brelse 120 0000000000000016 t test_and_clear_bit 120 000000000000000b t spin_lock_irq 119 000000000000001e t get_dma_ops 117 0000000000000053 t cpumask_next 116 0000000000000036 t kref_get 114 000000000000001a t schedule_work 106 000000000000000b t spin_lock_bh 103 0000000000000019 t arch_local_irq_disable ... Note sizes of marked functions. They are merely 9 bytes long! Selecting function with 'atomic' in their names: 355 0000000000000009 t atomic_inc 134 0000000000000009 t atomic_dec 98 0000000000000014 t atomic_dec_and_test 31 000000000000000e t atomic_add_return 27 000000000000000a t atomic64_inc 26 000000000000002f t kmap_atomic 24 0000000000000009 t atomic_add 12 0000000000000009 t atomic_sub 10 0000000000000021 t __atomic_add_unless 10 000000000000000a t atomic64_add 5 000000000000001f t __atomic_add_unless.constprop.7 5 000000000000000a t atomic64_dec 4 000000000000001f t __atomic_add_unless.constprop.18 4 000000000000001f t __atomic_add_unless.constprop.12 4 000000000000001f t __atomic_add_unless.constprop.10 3 000000000000001f t __atomic_add_unless.constprop.13 3 0000000000000011 t atomic64_add_return 2 000000000000001f t __atomic_add_unless.constprop.9 2 000000000000001f t __atomic_add_unless.constprop.8 2 000000000000001f t __atomic_add_unless.constprop.6 2 000000000000001f t __atomic_add_unless.constprop.5 2 000000000000001f t __atomic_add_unless.constprop.3 2 000000000000001f t __atomic_add_unless.constprop.22 2 000000000000001f t __atomic_add_unless.constprop.14 2 000000000000001f t __atomic_add_unless.constprop.11 2 000000000000001e t atomic_dec_if_positive 2 0000000000000014 t atomic_inc_and_test 2 0000000000000011 t atomic_add_return.constprop.4 2 0000000000000011 t atomic_add_return.constprop.17 2 0000000000000011 t atomic_add_return.constprop.16 2 000000000000000d t atomic_inc.constprop.4 2 000000000000000c t atomic_cmpxchg This patch fixes this for x86 atomic ops via s/inline/__always_inline/. This decreases allyesconfig kernel by about 25k: text data bss dec hex filename 82399481 22255416 20627456 125282353 777a831 vmlinux.before 82375570 22255544 20627456 125258570 7774b4a vmlinux Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1431080762-17797-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 75a9ee8529f3..e9168955c42f 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -22,7 +22,7 @@ * * Atomically reads the value of @v. */ -static inline int atomic_read(const atomic_t *v) +static __always_inline int atomic_read(const atomic_t *v) { return ACCESS_ONCE((v)->counter); } @@ -34,7 +34,7 @@ static inline int atomic_read(const atomic_t *v) * * Atomically sets the value of @v to @i. */ -static inline void atomic_set(atomic_t *v, int i) +static __always_inline void atomic_set(atomic_t *v, int i) { v->counter = i; } @@ -126,7 +126,7 @@ static __always_inline int atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static inline int atomic_inc_and_test(atomic_t *v) +static __always_inline int atomic_inc_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); } @@ -140,7 +140,7 @@ static inline int atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int atomic_add_negative(int i, atomic_t *v) +static __always_inline int atomic_add_negative(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s"); } @@ -164,7 +164,7 @@ static __always_inline int atomic_add_return(int i, atomic_t *v) * * Atomically subtracts @i from @v and returns @v - @i */ -static inline int atomic_sub_return(int i, atomic_t *v) +static __always_inline int atomic_sub_return(int i, atomic_t *v) { return atomic_add_return(-i, v); } @@ -172,7 +172,7 @@ static inline int atomic_sub_return(int i, atomic_t *v) #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) -static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { return cmpxchg(&v->counter, old, new); } @@ -213,7 +213,7 @@ static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) * Atomically adds 1 to @v * Returns the new value of @u */ -static inline short int atomic_inc_short(short int *v) +static __always_inline short int atomic_inc_short(short int *v) { asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); return *v; -- cgit v1.2.1 From c5c19941ad1bb18f010ae47f1db333c00b276d55 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 8 May 2015 13:25:45 +0300 Subject: x86/kconfig: Bump default NR_CPUS from 8 to 64 for 64-bit configuration Default NR_CPUS==8 is not enough to cover high-end desktop configuration: Haswell-E has upto 16 threads. Let's increase default NR_CPUS to 64 on 64-bit configuration. With this value CPU bitmask will still fit into one unsigned long. Default for 32-bit configuration is still 8: it's unlikely anybody will run 32-bit kernels on modern hardware. As an alternative we could bump NR_CPUS to 128 to cover all dual-processor servers with some margin. For reference: Debian and Suse build their kernels with NR_CPUS==512, Fedora -- 1024. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431080745-19792-1-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 226d5696e1d1..83cd1c7aa409 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -851,7 +851,8 @@ config NR_CPUS default "1" if !SMP default "8192" if MAXSMP default "32" if SMP && X86_BIGSMP - default "8" if SMP + default "8" if SMP && X86_32 + default "64" if SMP ---help--- This allows you to specify the maximum number of CPUs which this kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum -- cgit v1.2.1 From cad14bb9f8ef8bed42c3118adc0d9756e2aeeaa1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 8 May 2015 13:25:26 +0300 Subject: x86/kconfig: Fix the CONFIG_NR_CPUS description Since: b53b5eda8194 ("x86/cpu: Increase max CPU count to 8192") ... the maximum supported NR_CPUS for CPUMASK_OFFSTACK case is 8192. Let's adjust the description to reflect the change. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431080726-2490-1-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 83cd1c7aa409..c3333e5be5d7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -856,7 +856,7 @@ config NR_CPUS ---help--- This allows you to specify the maximum number of CPUs which this kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum - supported value is 4096, otherwise the maximum value is 512. The + supported value is 8192, otherwise the maximum value is 512. The minimum value which makes sense is 2. This is purely to save memory - each supported CPU adds -- cgit v1.2.1 From 63332a8455d8310b77d38779c6c21a660a8d9feb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Apr 2015 17:31:33 +0200 Subject: x86/entry: Stop using PER_CPU_VAR(kernel_stack) PER_CPU_VAR(kernel_stack) is redundant: - On the 64-bit build, we can use PER_CPU_VAR(cpu_tss + TSS_sp0). - On the 32-bit build, we can use PER_CPU_VAR(cpu_current_top_of_stack). PER_CPU_VAR(kernel_stack) will be deleted by a separate change. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429889495-27850-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/include/asm/thread_info.h | 8 +++++++- arch/x86/kernel/entry_64.S | 2 +- arch/x86/xen/xen-asm_64.S | 5 +++-- 4 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 2ab0f7182df3..1b1330c07971 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -346,7 +346,7 @@ ENTRY(ia32_cstar_target) SWAPGS_UNSAFE_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 - movq PER_CPU_VAR(kernel_stack),%rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0),%rsp ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index b4bdec3e9523..d656a363e1eb 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -198,9 +198,15 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ /* Load thread_info address into "reg" */ +#ifdef CONFIG_X86_32 #define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ _ASM_SUB $(THREAD_SIZE),reg ; +#else +#define GET_THREAD_INFO(reg) \ + _ASM_MOV PER_CPU_VAR(cpu_tss + TSS_sp0),reg ; \ + _ASM_SUB $(THREAD_SIZE),reg ; +#endif /* * ASM operand which evaluates to a 'thread_info' address of diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7423e3e2f5c5..c13b86b40176 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -216,7 +216,7 @@ ENTRY(system_call) GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack),%rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0),%rsp /* Construct struct pt_regs on stack */ pushq_cfi $__USER_DS /* pt_regs->ss */ diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index a2cabb8bd6bf..5aa7ec607b9e 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -53,7 +54,7 @@ ENTRY(xen_sysret64) * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack), %rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp pushq $__USER_DS pushq PER_CPU_VAR(rsp_scratch) @@ -72,7 +73,7 @@ ENTRY(xen_sysret32) * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack), %rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp pushq $__USER32_DS pushq PER_CPU_VAR(rsp_scratch) -- cgit v1.2.1 From fed7c3f0f750f225317828d691e9eb76eec887b3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Apr 2015 17:31:34 +0200 Subject: x86/entry: Remove unused 'kernel_stack' per-cpu variable Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429889495-27850-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 2 -- arch/x86/kernel/cpu/common.c | 4 ---- arch/x86/kernel/process_32.c | 5 +---- arch/x86/kernel/process_64.c | 3 --- arch/x86/kernel/smpboot.c | 2 -- 5 files changed, 1 insertion(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d656a363e1eb..472288962c99 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -177,8 +177,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -DECLARE_PER_CPU(unsigned long, kernel_stack); - static inline struct thread_info *current_thread_info(void) { return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a62cf04dac8a..6bec0b55863e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1155,10 +1155,6 @@ static __init int setup_disablecpuid(char *arg) } __setup("clearcpuid=", setup_disablecpuid); -DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(kernel_stack); - #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8ed2106b06da..a99900cedc22 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -302,13 +302,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0, kernel_stack, and current_top_of_stack. This changes + * Reload esp0 and cpu_current_top_of_stack. This changes * current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ddfdbf74f174..82134506faa8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -409,9 +409,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Reload esp0 and ss1. This changes current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE); - /* * Now maybe reload the debug registers and handle I/O bitmaps */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 50e547eac8cd..023cccf5a4ae 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -792,8 +792,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); #endif - per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; } /* -- cgit v1.2.1 From 3a23208e69679597e767cf3547b1a30dd845d9b5 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Apr 2015 17:31:35 +0200 Subject: x86/entry: Define 'cpu_current_top_of_stack' for 64-bit code 32-bit code has PER_CPU_VAR(cpu_current_top_of_stack). 64-bit code uses somewhat more obscure: PER_CPU_VAR(cpu_tss + TSS_sp0). Define the 'cpu_current_top_of_stack' macro on CONFIG_X86_64 as well so that the PER_CPU_VAR(cpu_current_top_of_stack) expression can be used in both 32-bit and 64-bit code. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429889495-27850-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 4 ++-- arch/x86/include/asm/thread_info.h | 10 ++++------ arch/x86/kernel/entry_64.S | 2 +- arch/x86/xen/xen-asm_64.S | 5 +++-- 4 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 1b1330c07971..63450a596800 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -113,7 +113,7 @@ ENTRY(ia32_sysenter_target) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ @@ -346,7 +346,7 @@ ENTRY(ia32_cstar_target) SWAPGS_UNSAFE_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 - movq PER_CPU_VAR(cpu_tss + TSS_sp0),%rsp + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 472288962c99..225ee545e1a0 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -195,16 +195,14 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ +#ifdef CONFIG_X86_64 +# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) +#endif + /* Load thread_info address into "reg" */ -#ifdef CONFIG_X86_32 #define GET_THREAD_INFO(reg) \ _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ _ASM_SUB $(THREAD_SIZE),reg ; -#else -#define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(cpu_tss + TSS_sp0),reg ; \ - _ASM_SUB $(THREAD_SIZE),reg ; -#endif /* * ASM operand which evaluates to a 'thread_info' address of diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c13b86b40176..09c3f9e0e07e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -216,7 +216,7 @@ ENTRY(system_call) GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_tss + TSS_sp0),%rsp + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp /* Construct struct pt_regs on stack */ pushq_cfi $__USER_DS /* pt_regs->ss */ diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 5aa7ec607b9e..04529e620559 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -54,7 +55,7 @@ ENTRY(xen_sysret64) * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq $__USER_DS pushq PER_CPU_VAR(rsp_scratch) @@ -73,7 +74,7 @@ ENTRY(xen_sysret32) * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq $__USER32_DS pushq PER_CPU_VAR(rsp_scratch) -- cgit v1.2.1 From c5bde906d2916d214d78cd8b67d665bf09867033 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 9 May 2015 11:36:50 -0400 Subject: x86/irq: Merge irq_regs & irq_stat Move irq_regs and irq_stat definitions to irq.c. Signed-off-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431185813-15413-2-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 6 ++++++ arch/x86/kernel/irq_32.c | 6 ------ arch/x86/kernel/irq_64.c | 6 ------ 3 files changed, 6 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e5952c225532..fe2ed8bb507b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,6 +22,12 @@ #define CREATE_TRACE_POINTS #include +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index f9fd86a7fcc7..cd74f5978ab9 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -21,12 +21,6 @@ #include -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - #ifdef CONFIG_DEBUG_STACKOVERFLOW int sysctl_panic_on_stackoverflow __read_mostly; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 394e643d7830..bc4604e500a3 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,12 +20,6 @@ #include #include -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - int sysctl_panic_on_stackoverflow; /* -- cgit v1.2.1 From c6e692f95dacddff5f3607717fb2246c60bbb714 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 9 May 2015 11:36:51 -0400 Subject: x86/asm/entry/irq: Remove unused invalidate_interrupt prototypes The invalidate_interrupt* functions no longer exist. Signed-off-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431185813-15413-3-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e9571ddabc4f..014c6382ffce 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -36,40 +36,6 @@ extern asmlinkage void spurious_interrupt(void); extern asmlinkage void thermal_interrupt(void); extern asmlinkage void reschedule_interrupt(void); -extern asmlinkage void invalidate_interrupt(void); -extern asmlinkage void invalidate_interrupt0(void); -extern asmlinkage void invalidate_interrupt1(void); -extern asmlinkage void invalidate_interrupt2(void); -extern asmlinkage void invalidate_interrupt3(void); -extern asmlinkage void invalidate_interrupt4(void); -extern asmlinkage void invalidate_interrupt5(void); -extern asmlinkage void invalidate_interrupt6(void); -extern asmlinkage void invalidate_interrupt7(void); -extern asmlinkage void invalidate_interrupt8(void); -extern asmlinkage void invalidate_interrupt9(void); -extern asmlinkage void invalidate_interrupt10(void); -extern asmlinkage void invalidate_interrupt11(void); -extern asmlinkage void invalidate_interrupt12(void); -extern asmlinkage void invalidate_interrupt13(void); -extern asmlinkage void invalidate_interrupt14(void); -extern asmlinkage void invalidate_interrupt15(void); -extern asmlinkage void invalidate_interrupt16(void); -extern asmlinkage void invalidate_interrupt17(void); -extern asmlinkage void invalidate_interrupt18(void); -extern asmlinkage void invalidate_interrupt19(void); -extern asmlinkage void invalidate_interrupt20(void); -extern asmlinkage void invalidate_interrupt21(void); -extern asmlinkage void invalidate_interrupt22(void); -extern asmlinkage void invalidate_interrupt23(void); -extern asmlinkage void invalidate_interrupt24(void); -extern asmlinkage void invalidate_interrupt25(void); -extern asmlinkage void invalidate_interrupt26(void); -extern asmlinkage void invalidate_interrupt27(void); -extern asmlinkage void invalidate_interrupt28(void); -extern asmlinkage void invalidate_interrupt29(void); -extern asmlinkage void invalidate_interrupt30(void); -extern asmlinkage void invalidate_interrupt31(void); - extern asmlinkage void irq_move_cleanup_interrupt(void); extern asmlinkage void reboot_interrupt(void); extern asmlinkage void threshold_interrupt(void); @@ -178,7 +144,6 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void); extern __visible void smp_reschedule_interrupt(struct pt_regs *); extern __visible void smp_call_function_interrupt(struct pt_regs *); extern __visible void smp_call_function_single_interrupt(struct pt_regs *); -extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif extern char irq_entries_start[]; -- cgit v1.2.1 From 51bb92843edcba5a58138cad25ced97923048add Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 9 May 2015 11:36:52 -0400 Subject: x86/asm/entry: Remove SYSCALL_VECTOR Use IA32_SYSCALL_VECTOR for both compat and native. Signed-off-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431185813-15413-4-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 3 --- arch/x86/kernel/traps.c | 4 ++-- arch/x86/lguest/boot.c | 4 ++-- 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 666c89ec4bd7..07f27926d473 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -47,9 +47,6 @@ #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR #define IA32_SYSCALL_VECTOR 0x80 -#ifdef CONFIG_X86_32 -# define SYSCALL_VECTOR 0x80 -#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 324ab5247687..5e0791f9d3dc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -997,8 +997,8 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(SYSCALL_VECTOR, &system_call); - set_bit(SYSCALL_VECTOR, used_vectors); + set_system_trap_gate(IA32_SYSCALL_VECTOR, &system_call); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif /* diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 8f9a133cc099..cab9aaa7802c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -90,7 +90,7 @@ struct lguest_data lguest_data = { .noirq_iret = (u32)lguest_noirq_iret, .kernel_address = PAGE_OFFSET, .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = SYSCALL_VECTOR, + .syscall_vec = IA32_SYSCALL_VECTOR, }; /*G:037 @@ -866,7 +866,7 @@ static void __init lguest_init_IRQ(void) for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { /* Some systems map "vectors" to interrupts weirdly. Not us! */ __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); - if (i != SYSCALL_VECTOR) + if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR)); } -- cgit v1.2.1 From 8b455e6577f325289cf2d1b20f493b2fe5c6c316 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 9 May 2015 11:36:53 -0400 Subject: x86/asm/entry/irq: Clean up IRQn_VECTOR macros Since the ISA irqs are in a single block, use ISA_IRQ_VECTOR(irq) instead of individual macros. Signed-off-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431185813-15413-5-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 18 +----------------- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/i8259.c | 8 ++++---- arch/x86/kernel/irqinit.c | 4 ++-- 5 files changed, 10 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 07f27926d473..117db96ad5fb 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -52,23 +52,7 @@ * Vectors 0x30-0x3f are used for ISA interrupts. * round up to the next 16-vector boundary */ -#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15) - -#define IRQ1_VECTOR (IRQ0_VECTOR + 1) -#define IRQ2_VECTOR (IRQ0_VECTOR + 2) -#define IRQ3_VECTOR (IRQ0_VECTOR + 3) -#define IRQ4_VECTOR (IRQ0_VECTOR + 4) -#define IRQ5_VECTOR (IRQ0_VECTOR + 5) -#define IRQ6_VECTOR (IRQ0_VECTOR + 6) -#define IRQ7_VECTOR (IRQ0_VECTOR + 7) -#define IRQ8_VECTOR (IRQ0_VECTOR + 8) -#define IRQ9_VECTOR (IRQ0_VECTOR + 9) -#define IRQ10_VECTOR (IRQ0_VECTOR + 10) -#define IRQ11_VECTOR (IRQ0_VECTOR + 11) -#define IRQ12_VECTOR (IRQ0_VECTOR + 12) -#define IRQ13_VECTOR (IRQ0_VECTOR + 13) -#define IRQ14_VECTOR (IRQ0_VECTOR + 14) -#define IRQ15_VECTOR (IRQ0_VECTOR + 15) +#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq) /* * Special IRQ vectors used by the SMP architecture, 0xf0-0xff diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f4dc2462a1ac..e01e4117188a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -258,11 +258,11 @@ int __init arch_early_ioapic_init(void) /* * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. + * ISA_IRQ_VECTOR(irq) for all cpu's. */ for (i = 0; i < nr_legacy_irqs(); i++) { cfg = alloc_irq_and_cfg_at(i, node); - cfg->vector = IRQ0_VECTOR + i; + cfg->vector = ISA_IRQ_VECTOR(i); cpumask_setall(cfg->domain); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6cedd7914581..82d44c314a3f 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -314,7 +314,7 @@ void setup_vector_irq(int cpu) * legacy vector to irq mapping: */ for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; + per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq; __setup_vector_irq(cpu); } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index e7cc5370cd2f..16cb827a5b27 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -329,8 +329,8 @@ static void init_8259A(int auto_eoi) */ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */ + outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); @@ -342,8 +342,8 @@ static void init_8259A(int auto_eoi) outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */ + outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index cd10a6437264..dc1e08d23552 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -86,7 +86,7 @@ void __init init_IRQ(void) int i; /* - * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. + * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If * these IRQ's are handled by more mordern controllers like IO-APIC, @@ -94,7 +94,7 @@ void __init init_IRQ(void) * irq's migrate etc. */ for (i = 0; i < nr_legacy_irqs(); i++) - per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; + per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i; x86_init.irqs.intr_init(); } -- cgit v1.2.1 From 62c7a1e9ae54ef66658df9614bdbc09cbbdaa6f0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 11 May 2015 09:47:23 +0200 Subject: locking/pvqspinlock: Rename QUEUED_SPINLOCK to QUEUED_SPINLOCKS Valentin Rothberg reported that we use CONFIG_QUEUED_SPINLOCKS in arch/x86/kernel/paravirt_patch_32.c, while the symbol is called CONFIG_QUEUED_SPINLOCK. (Note the extra 'S') But the typo was natural: the proper English term for such a generic object would be 'queued spinlocks' - so rename this and related symbols accordingly to the plural form. Reported-by: Valentin Rothberg Cc: Douglas Hatch Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Cc: Waiman Long Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++-- arch/x86/include/asm/paravirt.h | 6 +++--- arch/x86/include/asm/paravirt_types.h | 6 +++--- arch/x86/include/asm/spinlock.h | 4 ++-- arch/x86/include/asm/spinlock_types.h | 4 ++-- arch/x86/kernel/kvm.c | 10 +++++----- arch/x86/kernel/paravirt-spinlocks.c | 8 ++++---- arch/x86/kernel/paravirt_patch_64.c | 4 ++-- arch/x86/xen/spinlock.c | 10 +++++----- 9 files changed, 28 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 50ec043a920d..f8dc6abbe6ae 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -127,7 +127,7 @@ config X86 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_QUEUED_SPINLOCK + select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_QUEUE_RWLOCK select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select OLD_SIGACTION if X86_32 @@ -667,7 +667,7 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP - select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCK + select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 266c35381b62..d143bfad45d7 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,7 +712,7 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) -#ifdef CONFIG_QUEUED_SPINLOCK +#ifdef CONFIG_QUEUED_SPINLOCKS static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) @@ -735,7 +735,7 @@ static __always_inline void pv_kick(int cpu) PVOP_VCALL1(pv_lock_ops.kick, cpu); } -#else /* !CONFIG_QUEUED_SPINLOCK */ +#else /* !CONFIG_QUEUED_SPINLOCKS */ static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) @@ -749,7 +749,7 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } -#endif /* CONFIG_QUEUED_SPINLOCK */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ #endif /* SMP && PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 76cd68426af8..8766c7c395c2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -336,16 +336,16 @@ typedef u16 __ticket_t; struct qspinlock; struct pv_lock_ops { -#ifdef CONFIG_QUEUED_SPINLOCK +#ifdef CONFIG_QUEUED_SPINLOCKS void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); struct paravirt_callee_save queued_spin_unlock; void (*wait)(u8 *ptr, u8 val); void (*kick)(int cpu); -#else /* !CONFIG_QUEUED_SPINLOCK */ +#else /* !CONFIG_QUEUED_SPINLOCKS */ struct paravirt_callee_save lock_spinning; void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); -#endif /* !CONFIG_QUEUED_SPINLOCK */ +#endif /* !CONFIG_QUEUED_SPINLOCKS */ }; /* This contains all the paravirt structures: we get a convenient diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 4ec5413156ca..be0a05913b91 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -42,7 +42,7 @@ extern struct static_key paravirt_ticketlocks_enabled; static __always_inline bool static_key_false(struct static_key *key); -#ifdef CONFIG_QUEUED_SPINLOCK +#ifdef CONFIG_QUEUED_SPINLOCKS #include #else @@ -200,7 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) cpu_relax(); } } -#endif /* CONFIG_QUEUED_SPINLOCK */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ /* * Read-write spinlocks, allowing multiple readers diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 5df1f1b9a4b0..65c3e37f879a 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -23,7 +23,7 @@ typedef u32 __ticketpair_t; #define TICKET_SHIFT (sizeof(__ticket_t) * 8) -#ifdef CONFIG_QUEUED_SPINLOCK +#ifdef CONFIG_QUEUED_SPINLOCKS #include #else typedef struct arch_spinlock { @@ -36,7 +36,7 @@ typedef struct arch_spinlock { } arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } -#endif /* CONFIG_QUEUED_SPINLOCK */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ #include diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 6c21d931bd24..1681504e44a4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -585,7 +585,7 @@ static void kvm_kick_cpu(int cpu) } -#ifdef CONFIG_QUEUED_SPINLOCK +#ifdef CONFIG_QUEUED_SPINLOCKS #include @@ -615,7 +615,7 @@ out: local_irq_restore(flags); } -#else /* !CONFIG_QUEUED_SPINLOCK */ +#else /* !CONFIG_QUEUED_SPINLOCKS */ enum kvm_contention_stat { TAKEN_SLOW, @@ -850,7 +850,7 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) } } -#endif /* !CONFIG_QUEUED_SPINLOCK */ +#endif /* !CONFIG_QUEUED_SPINLOCKS */ /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. @@ -863,13 +863,13 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; -#ifdef CONFIG_QUEUED_SPINLOCK +#ifdef CONFIG_QUEUED_SPINLOCKS __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = kvm_wait; pv_lock_ops.kick = kvm_kick_cpu; -#else /* !CONFIG_QUEUED_SPINLOCK */ +#else /* !CONFIG_QUEUED_SPINLOCKS */ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); pv_lock_ops.unlock_kick = kvm_unlock_kick; #endif diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index a33f1eb15003..33ee3e0efd65 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -8,7 +8,7 @@ #include -#ifdef CONFIG_QUEUED_SPINLOCK +#ifdef CONFIG_QUEUED_SPINLOCKS __visible void __native_queued_spin_unlock(struct qspinlock *lock) { native_queued_spin_unlock(lock); @@ -25,15 +25,15 @@ bool pv_is_native_spin_unlock(void) struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP -#ifdef CONFIG_QUEUED_SPINLOCK +#ifdef CONFIG_QUEUED_SPINLOCKS .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .wait = paravirt_nop, .kick = paravirt_nop, -#else /* !CONFIG_QUEUED_SPINLOCK */ +#else /* !CONFIG_QUEUED_SPINLOCKS */ .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), .unlock_kick = paravirt_nop, -#endif /* !CONFIG_QUEUED_SPINLOCK */ +#endif /* !CONFIG_QUEUED_SPINLOCKS */ #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index e0fb41c8255b..a1fa86782186 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -21,7 +21,7 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCK) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); #endif @@ -65,7 +65,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCK) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { start = start_pv_lock_ops_queued_spin_unlock; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index af907a90fb19..9e2ba5c6e1dd 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -21,7 +21,7 @@ static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(char *, irq_name); static bool xen_pvspin = true; -#ifdef CONFIG_QUEUED_SPINLOCK +#ifdef CONFIG_QUEUED_SPINLOCKS #include @@ -65,7 +65,7 @@ static void xen_qlock_wait(u8 *byte, u8 val) xen_poll_irq(irq); } -#else /* CONFIG_QUEUED_SPINLOCK */ +#else /* CONFIG_QUEUED_SPINLOCKS */ enum xen_contention_stat { TAKEN_SLOW, @@ -264,7 +264,7 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) } } } -#endif /* CONFIG_QUEUED_SPINLOCK */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ static irqreturn_t dummy_handler(int irq, void *dev_id) { @@ -328,7 +328,7 @@ void __init xen_init_spinlocks(void) return; } printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); -#ifdef CONFIG_QUEUED_SPINLOCK +#ifdef CONFIG_QUEUED_SPINLOCKS __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); @@ -366,7 +366,7 @@ static __init int xen_parse_nopvspin(char *arg) } early_param("xen_nopvspin", xen_parse_nopvspin); -#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCK) +#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS) static struct dentry *d_spin_debug; -- cgit v1.2.1 From f21262b8e092a770e39fbd405cc18a0247c3af68 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 11 May 2015 10:15:46 +0200 Subject: x86/alternatives: Switch AMD F15h and later to the P6 NOPs Software optimization guides for both F15h and F16h cite those NOPs as the optimal ones. A microbenchmark confirms that actually even older families are better with the single-insn NOPs so switch to them for the alternatives. Cycles count below includes the loop overhead of the measurement but that overhead is the same with all runs. F10h, revE: ----------- Running NOP tests, 1000 NOPs x 1000000 repetitions K8: 90 288.212282 cycles 66 90 288.220840 cycles 66 66 90 288.219447 cycles 66 66 66 90 288.223204 cycles 66 66 90 66 90 571.393424 cycles 66 66 90 66 66 90 571.374919 cycles 66 66 66 90 66 66 90 572.249281 cycles 66 66 66 90 66 66 66 90 571.388651 cycles P6: 90 288.214193 cycles 66 90 288.225550 cycles 0f 1f 00 288.224441 cycles 0f 1f 40 00 288.225030 cycles 0f 1f 44 00 00 288.233558 cycles 66 0f 1f 44 00 00 324.792342 cycles 0f 1f 80 00 00 00 00 325.657462 cycles 0f 1f 84 00 00 00 00 00 430.246643 cycles F14h: ---- Running NOP tests, 1000 NOPs x 1000000 repetitions K8: 90 510.404890 cycles 66 90 510.432117 cycles 66 66 90 510.561858 cycles 66 66 66 90 510.541865 cycles 66 66 90 66 90 1014.192782 cycles 66 66 90 66 66 90 1014.226546 cycles 66 66 66 90 66 66 90 1014.334299 cycles 66 66 66 90 66 66 66 90 1014.381205 cycles P6: 90 510.436710 cycles 66 90 510.448229 cycles 0f 1f 00 510.545100 cycles 0f 1f 40 00 510.502792 cycles 0f 1f 44 00 00 510.589517 cycles 66 0f 1f 44 00 00 510.611462 cycles 0f 1f 80 00 00 00 00 511.166794 cycles 0f 1f 84 00 00 00 00 00 511.651641 cycles F15h: ----- Running NOP tests, 1000 NOPs x 1000000 repetitions K8: 90 243.128396 cycles 66 90 243.129883 cycles 66 66 90 243.131631 cycles 66 66 66 90 242.499324 cycles 66 66 90 66 90 481.829083 cycles 66 66 90 66 66 90 481.884413 cycles 66 66 66 90 66 66 90 481.851446 cycles 66 66 66 90 66 66 66 90 481.409220 cycles P6: 90 243.127026 cycles 66 90 243.130711 cycles 0f 1f 00 243.122747 cycles 0f 1f 40 00 242.497617 cycles 0f 1f 44 00 00 245.354461 cycles 66 0f 1f 44 00 00 361.930417 cycles 0f 1f 80 00 00 00 00 362.844944 cycles 0f 1f 84 00 00 00 00 00 480.514948 cycles F16h: ----- Running NOP tests, 1000 NOPs x 1000000 repetitions K8: 90 507.793298 cycles 66 90 507.789636 cycles 66 66 90 507.826490 cycles 66 66 66 90 507.859075 cycles 66 66 90 66 90 1008.663129 cycles 66 66 90 66 66 90 1008.696259 cycles 66 66 66 90 66 66 90 1008.692517 cycles 66 66 66 90 66 66 66 90 1008.755399 cycles P6: 90 507.795232 cycles 66 90 507.794761 cycles 0f 1f 00 507.834901 cycles 0f 1f 40 00 507.822629 cycles 0f 1f 44 00 00 507.838493 cycles 66 0f 1f 44 00 00 507.908597 cycles 0f 1f 80 00 00 00 00 507.946417 cycles 0f 1f 84 00 00 00 00 00 507.954960 cycles Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431332153-18566-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index aef653193160..b0932c4341b3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -227,6 +227,15 @@ void __init arch_init_ideal_nops(void) #endif } break; + + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 > 0xf) { + ideal_nops = p6_nops; + return; + } + + /* fall through */ + default: #ifdef CONFIG_X86_64 ideal_nops = k8_nops; -- cgit v1.2.1 From 6b44e72a1c45d1a4e903af75611235a2d6ea25e3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 11 May 2015 10:15:47 +0200 Subject: x86/cpu/microcode: Zap changelog It is useless at best and git history has it all detailed anyway. Update copyright while at it. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431332153-18566-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/core.c | 76 +++++------------------------------ arch/x86/kernel/cpu/microcode/intel.c | 75 ++++------------------------------ 2 files changed, 16 insertions(+), 135 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 36a83617eb21..6236a54a63f4 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -1,74 +1,16 @@ /* - * Intel CPU Microcode Update Driver for Linux + * CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian - * 2006 Shaohua Li + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * 2013-2015 Borislav Petkov * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. + * This driver allows to upgrade microcode on x86 processors. * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman for the fix. - * 1.08 11 Dec 2000, Richard Schaal and - * Tigran Aivazian - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick and - * Tigran Aivazian , - * Serialize updates as required on HT processors due to - * speculative nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble , - * Jun Nakajima - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index a41beadb3db9..e20d4e58cd89 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -1,74 +1,13 @@ /* - * Intel CPU Microcode Update Driver for Linux + * Intel CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian - * 2006 Shaohua Li + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman for the fix. - * 1.08 11 Dec 2000, Richard Schaal and - * Tigran Aivazian - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick and - * Tigran Aivazian , - * Serialize updates as required on HT processors due to - * speculative nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble , - * Jun Nakajima - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -- cgit v1.2.1 From 6c434d6176c0cb42847c33245189667d645db7bf Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 11 May 2015 10:15:49 +0200 Subject: x86/mm: Do not flush last cacheline twice in clflush_cache_range() The current algorithm used in clflush_cache_range() can cause the last cache line of the buffer to be flushed twice. Fix that algorithm so that each cache line will only be flushed once. Reported-by: H. Peter Anvin Signed-off-by: Ross Zwisler Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/1430259192-18802-1-git-send-email-ross.zwisler@linux.intel.com Link: http://lkml.kernel.org/r/1431332153-18566-5-git-send-email-bp@alien8.de [ Changed it to 'void *' to simplify the type conversions. ] Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 89af288ec674..5ddd9005f6c3 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -129,16 +129,15 @@ within(unsigned long addr, unsigned long start, unsigned long end) */ void clflush_cache_range(void *vaddr, unsigned int size) { - void *vend = vaddr + size - 1; + unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + void *vend = vaddr + size; + void *p; mb(); - for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) - clflushopt(vaddr); - /* - * Flush any possible final partial cacheline: - */ - clflushopt(vend); + for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + p < vend; p += boot_cpu_data.x86_clflush_size) + clflushopt(p); mb(); } -- cgit v1.2.1 From ca7d9b795e6bc78c80a1771ada867994fabcfc01 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 11 May 2015 10:15:51 +0200 Subject: x86/mm: Add kerneldoc comments for pcommit_sfence() Add kerneldoc comments for pcommit_sfence() describing the purpose of the PCOMMIT instruction and demonstrating its usage with an example. Signed-off-by: Ross Zwisler Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H Peter Anvin Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/1430261196-2401-1-git-send-email-ross.zwisler@linux.intel.com Link: http://lkml.kernel.org/r/1431332153-18566-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/special_insns.h | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index aeb4666e0c0a..2270e41b32fd 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -215,6 +215,44 @@ static inline void clwb(volatile void *__p) : [pax] "a" (p)); } +/** + * pcommit_sfence() - persistent commit and fence + * + * The PCOMMIT instruction ensures that data that has been flushed from the + * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to + * memory and is durable on the DIMM. The primary use case for this is + * persistent memory. + * + * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT + * with appropriate fencing. + * + * Example: + * void flush_and_commit_buffer(void *vaddr, unsigned int size) + * { + * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + * void *vend = vaddr + size; + * void *p; + * + * for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + * p < vend; p += boot_cpu_data.x86_clflush_size) + * clwb(p); + * + * // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes + * // MFENCE via mb() also works + * wmb(); + * + * // PCOMMIT and the required SFENCE for ordering + * pcommit_sfence(); + * } + * + * After this function completes the data pointed to by 'vaddr' has been + * accepted to memory and will be durable if the 'vaddr' points to persistent + * memory. + * + * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify + * things we include both the PCOMMIT and the required SFENCE in the + * alternatives generated by pcommit_sfence(). + */ static inline void pcommit_sfence(void) { alternative(ASM_NOP7, -- cgit v1.2.1 From cd2f6a5a4704a359635eb34919317052e6a96ba7 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 11 May 2015 10:15:52 +0200 Subject: x86/mm/mtrr: Remove incorrect address check in __mtrr_type_lookup() __mtrr_type_lookup() checks MTRR fixed ranges when mtrr_state.have_fixed is set and start is less than 0x100000. However, the 'else if (start < 0x1000000)' in the code checks with an incorrect address as it has an extra-zero in the address. The code still runs correctly as this check is meaningless, though. This patch replaces the incorrect address check with 'else' with no condition. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1427234921-19737-4-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1431332153-18566-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7d74f7b3c6ba..5b239679cfc9 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -137,7 +137,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) idx = 1 * 8; idx += ((start - 0x80000) >> 14); return mtrr_state.fixed_ranges[idx]; - } else if (start < 0x1000000) { + } else { idx = 3 * 8; idx += ((start - 0xC0000) >> 12); return mtrr_state.fixed_ranges[idx]; -- cgit v1.2.1 From e4b6be33c28923d8cde53023e0888b1c5d1a9027 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Mon, 11 May 2015 10:15:53 +0200 Subject: x86/mm: Add ioremap_uc() helper to map memory uncacheable (not UC-) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ioremap_nocache() currently uses UC- by default. Our goal is to eventually make UC the default. Linux maps UC- to PCD=1, PWT=0 page attributes on non-PAT systems. Linux maps UC to PCD=1, PWT=1 page attributes on non-PAT systems. On non-PAT and PAT systems a WC MTRR has different effects on pages with either of these attributes. In order to help with a smooth transition its best to enable use of UC (PCD,1, PWT=1) on a region as that ensures a WC MTRR will have no effect on a region, this however requires us to have an way to declare a region as UC and we currently do not have a way to do this. WC MTRR on non-PAT system with PCD=1, PWT=0 (UC-) yields WC. WC MTRR on non-PAT system with PCD=1, PWT=1 (UC) yields UC. WC MTRR on PAT system with PCD=1, PWT=0 (UC-) yields WC. WC MTRR on PAT system with PCD=1, PWT=1 (UC) yields UC. A flip of the default ioremap_nocache() behaviour from UC- to UC can therefore regress a memory region from effective memory type WC to UC if MTRRs are used. Use of MTRRs should be phased out and in the best case only arch_phys_wc_add() use will remain, even if this happens arch_phys_wc_add() will have an effect on non-PAT systems and changes to default ioremap_nocache() behaviour could regress drivers. Now, ideally we'd use ioremap_nocache() on the regions in which we'd need uncachable memory types and avoid any MTRRs on those regions. There are however some restrictions on MTRRs use, such as the requirement of having the base and size of variable sized MTRRs to be powers of two, which could mean having to use a WC MTRR over a large area which includes a region in which write-combining effects are undesirable. Add ioremap_uc() to help with the both phasing out of MTRR use and also provide a way to blacklist small WC undesirable regions in devices with mixed regions which are size-implicated to use large WC MTRRs. Use of ioremap_uc() helps phase out MTRR use by avoiding regressions with an eventual flip of default behaviour or ioremap_nocache() from UC- to UC. Drivers working with WC MTRRs can use the below table to review and consider the use of ioremap*() and similar helpers to ensure appropriate behaviour long term even if default ioremap_nocache() behaviour changes from UC- to UC. Although ioremap_uc() is being added we leave set_memory_uc() to use UC- as only initial memory type setup is required to be able to accommodate existing device drivers and phase out MTRR use. It should also be clarified that set_memory_uc() cannot be used with IO memory, even though its use will not return any errors, it really has no effect. ---------------------------------------------------------------------- MTRR Non-PAT PAT Linux ioremap value Effective memory type ---------------------------------------------------------------------- Non-PAT | PAT PAT |PCD ||PWT ||| WC 000 WB _PAGE_CACHE_MODE_WB WC | WC WC 001 WC _PAGE_CACHE_MODE_WC WC* | WC WC 010 UC- _PAGE_CACHE_MODE_UC_MINUS WC* | WC WC 011 UC _PAGE_CACHE_MODE_UC UC | UC ---------------------------------------------------------------------- Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Acked-by: H. Peter Anvin Cc: Andy Lutomirski Cc: Antonino Daplas Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Vetter Cc: Dave Airlie Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: Jean-Christophe Plagniol-Villard Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Travis Cc: Peter Zijlstra Cc: Suresh Siddha Cc: Thierry Reding Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: Toshi Kani Cc: Ville Syrjälä Cc: Vlastimil Babka Cc: Will Deacon Cc: linux-fbdev@vger.kernel.org Link: http://lkml.kernel.org/r/1430343851-967-2-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1431332153-18566-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 1 + arch/x86/mm/ioremap.c | 36 +++++++++++++++++++++++++++++++++++- arch/x86/mm/pageattr.c | 3 +++ 3 files changed, 39 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 34a5b93704d3..4afc05ffa566 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -177,6 +177,7 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * look at pci_iomap(). */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 70e7444c6835..a493bb83aa89 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -237,7 +237,8 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; * * Till we fix all X drivers to use ioremap_wc(), we will use - * UC MINUS. + * UC MINUS. Drivers that are certain they need or can already + * be converted over to strong UC can use ioremap_uc(). */ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; @@ -246,6 +247,39 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) } EXPORT_SYMBOL(ioremap_nocache); +/** + * ioremap_uc - map bus memory into CPU space as strongly uncachable + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * ioremap_uc performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked with a strong + * preference as completely uncachable on the CPU when possible. For non-PAT + * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT + * systems this will set the PAT entry for the pages as strong UC. This call + * will honor existing caching rules from things like the PCI bus. Note that + * there are other caches and buffers on many busses. In particular driver + * authors should read up on PCI writes. + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) +{ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; + + return __ioremap_caller(phys_addr, size, pcm, + __builtin_return_address(0)); +} +EXPORT_SYMBOL_GPL(ioremap_uc); + /** * ioremap_wc - map memory into CPU space write combined * @phys_addr: bus address of the memory diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 5ddd9005f6c3..c77abd7f92a2 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1467,6 +1467,9 @@ int _set_memory_uc(unsigned long addr, int numpages) { /* * for now UC MINUS. see comments in ioremap_nocache() + * If you really need strong UC use ioremap_uc(), but note + * that you cannot override IO areas with set_memory_*() as + * these helpers cannot work with IO memory. */ return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), -- cgit v1.2.1 From 1fcb61c52bbdbbc46d132acf7dab9ad0eca433fe Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 23 Apr 2015 01:07:08 -0700 Subject: x86/mm/pageattr: Remove an unused variable in slow_virt_to_phys() The patch doesn't change any logic. Signed-off-by: Dexuan Cui Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1429776428-4475-1-git-send-email-decui@microsoft.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c77abd7f92a2..397838eb292b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -417,13 +417,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr) phys_addr_t phys_addr; unsigned long offset; enum pg_level level; - unsigned long psize; unsigned long pmask; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); - psize = page_level_size(level); pmask = page_level_mask(level); offset = virt_addr & ~pmask; phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; -- cgit v1.2.1 From a41f3c8cd4e28dcbebd8ec27a9602c86cfa5f009 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 23 Apr 2015 08:56:42 +0200 Subject: perf/x86/intel/uncore: Add Broadwell-U uncore IMC PMU support This patch enables the uncore Memory Controller (IMC) PMU support for Intel Broadwell-U (Model 61) mobile processors. The IMC PMU enables measuring memory bandwidth. To use with perf: $ perf stat -a -I 1000 -e uncore_imc/data_reads/,uncore_imc/data_writes/ sleep 10 Tested-by: Sonny Rao Signed-off-by: Stephane Eranian Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: kan.liang@intel.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/20150423065642.GA4890@thinkpad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 3 +++ arch/x86/kernel/cpu/perf_event_intel_uncore.h | 1 + arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c635b8b49e93..a03f96402dbe 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -922,6 +922,9 @@ static int __init uncore_pci_init(void) case 69: /* Haswell Celeron */ ret = hsw_uncore_pci_init(); break; + case 61: /* Broadwell */ + ret = bdw_uncore_pci_init(); + break; default: return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 6c8c1e7e69d8..06b07930e48b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -326,6 +326,7 @@ extern struct event_constraint uncore_constraint_empty; int snb_uncore_pci_init(void); int ivb_uncore_pci_init(void); int hsw_uncore_pci_init(void); +int bdw_uncore_pci_init(void); void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index 4562e9e22c60..b005a78c7012 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -7,6 +7,7 @@ #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 #define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 +#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -486,6 +487,14 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = { { /* end: all zeroes */ }, }; +static const struct pci_device_id bdw_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + static struct pci_driver snb_uncore_pci_driver = { .name = "snb_uncore", .id_table = snb_uncore_pci_ids, @@ -501,6 +510,11 @@ static struct pci_driver hsw_uncore_pci_driver = { .id_table = hsw_uncore_pci_ids, }; +static struct pci_driver bdw_uncore_pci_driver = { + .name = "bdw_uncore", + .id_table = bdw_uncore_pci_ids, +}; + struct imc_uncore_pci_dev { __u32 pci_id; struct pci_driver *driver; @@ -514,6 +528,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ + IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ { /* end marker */ } }; @@ -561,6 +576,11 @@ int hsw_uncore_pci_init(void) return imc_uncore_pci_init(); } +int bdw_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + /* end of Sandy Bridge uncore support */ /* Nehalem uncore support */ -- cgit v1.2.1 From d68921f9bd148359e6d01c84aaa2e32bfbd82970 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 11 May 2015 17:27:09 -0400 Subject: x86/smp/boot: Add cmdline "cpu_init_udelay=N" to specify cpu_up() delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No change to default behavior. Replace the hard-coded mdelay(10) in cpu_up() with a variable udelay, that is set to a defined default -- rather than a magic number. Add a boot-time override, "cpu_init_udelay=N" Signed-off-by: Len Brown Cc: Alan Cox Cc: Arjan van de Ven Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jan H. Schönherr Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2fe8e6c798e8def271122f62df9bbf58dc283e2a.1431379433.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 51203f60587f..0629a8e513af 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -513,6 +513,27 @@ void __inquire_remote_apic(int apicid) } } +/* + * The Multiprocessor Specification 1.4 (1997) example code suggests + * that there should be a 10ms delay between the BSP asserting INIT + * and de-asserting INIT, when starting a remote processor. + * But that slows boot and resume on modern processors, which include + * many cores and don't require that delay. + * + * Cmdline "init_cpu_udelay=" is available to over-ride this delay. + */ +#define UDELAY_10MS_DEFAULT 10000 + +static unsigned int init_udelay = UDELAY_10MS_DEFAULT; + +static int __init cpu_init_udelay(char *str) +{ + get_option(&str, &init_udelay); + + return 0; +} +early_param("cpu_init_udelay", cpu_init_udelay); + /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this @@ -584,7 +605,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); - mdelay(10); + mdelay(init_udelay); pr_debug("Deasserting INIT\n"); -- cgit v1.2.1 From 1a744cb356c57303fc97eb15a298032170f841fa Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 11 May 2015 17:27:10 -0400 Subject: x86/smp/boot: Remove 10ms delay from cpu_up() on modern processors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modern processor familes do not require the 10ms delay in cpu_up() to de-assert INIT. This speeds up boot and resume by 10ms per (application) processor. Signed-off-by: Len Brown Cc: Alan Cox Cc: Arjan van de Ven Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jan H. Schönherr Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/021ce30c88f216ad39686646421194dc25671e55.1431379433.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0629a8e513af..85bd6aad8c74 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -521,6 +521,7 @@ void __inquire_remote_apic(int apicid) * many cores and don't require that delay. * * Cmdline "init_cpu_udelay=" is available to over-ride this delay. + * Modern processor families are quirked to remove the delay entirely. */ #define UDELAY_10MS_DEFAULT 10000 @@ -534,6 +535,18 @@ static int __init cpu_init_udelay(char *str) } early_param("cpu_init_udelay", cpu_init_udelay); +static void __init smp_quirk_init_udelay(void) +{ + /* if cmdline changed it from default, leave it alone */ + if (init_udelay != UDELAY_10MS_DEFAULT) + return; + + /* if modern processor, use no delay */ + if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + init_udelay = 0; +} + /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this @@ -1210,6 +1223,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) uv_system_init(); set_mtrr_aps_delayed_init(); + + smp_quirk_init_udelay(); } void arch_enable_nonboot_cpus_begin(void) -- cgit v1.2.1 From c7114b4e6c53111d415485875725b60213ffc675 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 11 May 2015 13:57:11 -0400 Subject: locking/qrwlock: Rename QUEUE_RWLOCK to QUEUED_RWLOCKS To be consistent with the queued spinlocks which use CONFIG_QUEUED_SPINLOCKS config parameter, the one for the queued rwlocks is now renamed to CONFIG_QUEUED_RWLOCKS. Signed-off-by: Waiman Long Cc: Borislav Petkov Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431367031-36697-1-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f8dc6abbe6ae..4e986e809861 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -128,7 +128,7 @@ config X86 select CLONE_BACKWARDS if X86_32 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_SPINLOCKS - select ARCH_USE_QUEUE_RWLOCK + select ARCH_USE_QUEUED_RWLOCKS select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select OLD_SIGACTION if X86_32 select COMPAT_OLD_SIGACTION if IA32_EMULATION -- cgit v1.2.1 From a5a2b4da012d3ed92476b169a66e17507abe1a48 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 11 May 2015 17:48:04 +0800 Subject: crypto: aesni - Use crypto_aead_set_reqsize helper This patch uses the crypto_aead_set_reqsize helper to avoid directly touching the internals of aead. Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 112cefacf2af..2ade24000246 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -807,8 +807,9 @@ static int rfc4106_init(struct crypto_tfm *tfm) child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child); memcpy(child_ctx, ctx, sizeof(*ctx)); ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_aead.reqsize = sizeof(struct aead_request) - + crypto_aead_reqsize(&cryptd_tfm->base); + crypto_aead_set_reqsize(__crypto_aead_cast(tfm), + sizeof(struct aead_request) + + crypto_aead_reqsize(&cryptd_tfm->base)); return 0; } -- cgit v1.2.1 From 853b160aaafbe27d6304c8832bb7340d57c6b04e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 May 2015 08:40:49 +0200 Subject: Revert f5d6a52f5111 ("x86/smpboot: Skip delays during SMP initialization similar to Xen") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Huang Ying reported x86 boot hangs due to this commit. Turns out that the change, despite its changelog, does more than just change timeouts: it also changes the way we assert/deassert INIT via the APIC_DM_INIT IPI, in the x2apic case it skips the deassert step. This is historically fragile code and the patch did not improve it, so revert these changes. This commit: 1a744cb356c5 ("x86/smp/boot: Remove 10ms delay from cpu_up() on modern processors") independently removes the worst of the delays (the 10 msec delay). The remaining delays can be addressed one by one, combined with careful testing. Reported-by: Huang Ying Cc: Anthony Liguori Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Gang Wei Cc: H. Peter Anvin Cc: Jan H. Schönherr Cc: Len Brown Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Deegan Link: http://lkml.kernel.org/r/1430732554-7294-1-git-send-email-jschoenh@amazon.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 58 ++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85bd6aad8c74..b9aaa3930b2f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -614,34 +614,22 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); - if (!cpu_has_x2apic) { - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mdelay(init_udelay); + mdelay(init_udelay); - pr_debug("Deasserting INIT\n"); + pr_debug("Deasserting INIT\n"); - /* Target chip */ - /* Send IPI */ - apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); + /* Target chip */ + /* Send IPI */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mb(); - atomic_set(&init_deasserted, 1); - } else if (tboot_enabled()) { - /* - * With tboot AP is actually spinning in a mini-guest before - * receiving INIT. Upon receiving INIT ipi, AP need time to - * VMExit, update VMCS to tracking SIPIs and VMResume. - * - * While AP is in root mode handling the INIT the CPU will drop - * any SIPIs - */ - udelay(10); - } + mb(); + atomic_set(&init_deasserted, 1); /* * Should we send STARTUP IPIs ? @@ -683,22 +671,20 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid); - if (!cpu_has_x2apic) { - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(300); + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(300); - pr_debug("Startup point 1\n"); + pr_debug("Startup point 1\n"); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); - } + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); -- cgit v1.2.1 From 4a00c95dcdba45c9592af2e908c0816fd54f5544 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 11 May 2015 18:56:49 +0900 Subject: x86/hpet: Pass proper pointer to irq_alloc_info Fix the following oops: hpet_msi_get_hwirq+0x1f/0x27 msi_domain_alloc+0x35/0xfe ? trace_hardirqs_on_caller+0x16c/0x188 irq_domain_alloc_irqs_recursive+0x51/0x95 __irq_domain_alloc_irqs+0x151/0x223 hpet_assign_irq+0x5d/0x68 hpet_msi_capability_lookup+0x121/0x1cb ? hpet_enable+0x2b4/0x2b4 hpet_late_init+0x5f/0xf2 ? hpet_enable+0x2b4/0x2b4 do_one_initcall+0x184/0x199 kernel_init_freeable+0x1af/0x237 ? rest_init+0x13a/0x13a kernel_init+0xe/0xd4 ret_from_fork+0x3f/0x70 ? rest_init+0x13a/0x13a Since 3cb96f0c9733 ('x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains') hpet_msi_capability_lookup() uses hpet_assign_irq(). The latter initializes irq_alloc_info on stack, but passes a NULL pointer to irq_domain_alloc_irqs(), which causes a NULL pointer dereference later in hpet_msi_get_hwirq(). Pass the pointer to the irq_alloc_info irq_domain_alloc_irqs(). Fixes: 3cb96f0c9733 'x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains' Signed-off-by: Sergey Senozhatsky Reviewed-by: Jiang Liu Cc: Sergey Senozhatsky Link: http://lkml.kernel.org/r/20150512041444.GA1094@swordfish Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 58fde664e7c0..ef516afa20bb 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -351,6 +351,6 @@ int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, info.hpet_id = hpet_dev_id(domain); info.hpet_index = dev_num; - return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, NULL); + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); } #endif -- cgit v1.2.1 From 486ca539caa082c7f2929c207af1b3ce2a304489 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 7 May 2015 10:53:56 +0800 Subject: x86, irq: Allocate CPU vectors from device local CPUs if possible On NUMA systems, an IO device may be associated with a NUMA node. It may improve IO performance to allocate resources, such as memory and interrupts, from device local node. This patch introduces a mechanism to support CPU vector allocation policies. It tries to allocate CPU vectors from CPUs on device local node first, and then fallback to all online(global) CPUs. This mechanism may be used to support NumaConnect systems to allocate CPU vectors from device local node. Signed-off-by: Jiang Liu Tested-by: Daniel J Blueman Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1430967244-28905-1-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 2766747e1a3b..b590c9d6736a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -210,6 +210,18 @@ static int assign_irq_vector(int irq, struct apic_chip_data *data, return err; } +static int assign_irq_vector_policy(int irq, int node, + struct apic_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->mask) + return assign_irq_vector(irq, data, info->mask); + if (node != NUMA_NO_NODE && + assign_irq_vector(irq, data, cpumask_of_node(node)) == 0) + return 0; + return assign_irq_vector(irq, data, apic->target_cpus()); +} + static void clear_irq_vector(int irq, struct apic_chip_data *data) { int cpu, vector; @@ -258,12 +270,6 @@ void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) memset(dst, 0, sizeof(*dst)); } -static inline const struct cpumask * -irq_alloc_info_get_mask(struct irq_alloc_info *info) -{ - return (!info || !info->mask) ? apic->target_cpus() : info->mask; -} - static void x86_vector_free_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { @@ -289,7 +295,6 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, { struct irq_alloc_info *info = arg; struct apic_chip_data *data; - const struct cpumask *mask; struct irq_data *irq_data; int i, err; @@ -300,7 +305,6 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) return -ENOSYS; - mask = irq_alloc_info_get_mask(info); for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irq_data); @@ -318,7 +322,8 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip = &lapic_controller; irq_data->chip_data = data; irq_data->hwirq = virq + i; - err = assign_irq_vector(virq, data, mask); + err = assign_irq_vector_policy(virq, irq_data->node, data, + info); if (err) goto error; } -- cgit v1.2.1 From a22e5f579b98f16e24b7184d01c35de26eb5a7f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 May 2015 10:54:25 +0200 Subject: arch: Remove __ARCH_HAVE_CMPXCHG We removed the only user of this define in the rtmutex code. Get rid of it. Signed-off-by: Thomas Gleixner Cc: Sebastian Andrzej Siewior --- arch/x86/include/asm/cmpxchg.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 99c105d78b7e..ad19841eddfe 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -4,8 +4,6 @@ #include #include /* Provides LOCK_PREFIX */ -#define __HAVE_ARCH_CMPXCHG 1 - /* * Non-existant functions to indicate usage errors at link time * (or compile-time if the compiler implements __compiletime_error(). -- cgit v1.2.1 From af658dca221207174fc0a7bcdcd4cff7c589fdd8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 29 Apr 2015 14:36:05 -0400 Subject: tracing: Rename ftrace_event.h to trace_events.h The term "ftrace" is really the infrastructure of the function hooks, and not the trace events. Rename ftrace_event.h to trace_events.h to represent the trace_event infrastructure and decouple the term ftrace from it. Signed-off-by: Steven Rostedt --- arch/x86/kvm/mmutrace.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index ce463a9cc8fb..5a24b846a1cb 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -2,7 +2,7 @@ #define _TRACE_KVMMMU_H #include -#include +#include #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce741b8650f6..19788d5b1109 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f7b61687bd79..9b5adf4860c2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.1 From 26e7d9dee8a5b6c844178c8e2d91be540ce311c0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 13 May 2015 19:42:22 +0200 Subject: x86/asm/uaccess: Remove FIX_ALIGNMENT define from copy_user_nocache_64.S: No code changed: # arch/x86/lib/copy_user_nocache_64.o: text data bss dec hex filename 390 0 0 390 186 copy_user_nocache_64.o.before 390 0 0 390 186 copy_user_nocache_64.o.after md5: 7fa0577b28700af89d3a67a8b590426e copy_user_nocache_64.o.before.asm 7fa0577b28700af89d3a67a8b590426e copy_user_nocache_64.o.after.asm Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431538944-27724-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/lib/copy_user_nocache_64.S | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index 6a4f43c2d9e6..42eeb12e0cd9 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -8,9 +8,6 @@ #include #include - -#define FIX_ALIGNMENT 1 - #include #include #include @@ -18,7 +15,6 @@ #include .macro ALIGN_DESTINATION -#ifdef FIX_ALIGNMENT /* check for bad alignment of destination */ movl %edi,%ecx andl $7,%ecx @@ -40,7 +36,6 @@ _ASM_EXTABLE(100b,103b) _ASM_EXTABLE(101b,103b) -#endif .endm /* -- cgit v1.2.1 From 9e6b13f761d5914a8c9b83610e8d459653515c94 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 13 May 2015 19:42:23 +0200 Subject: x86/asm/uaccess: Unify the ALIGN_DESTINATION macro Pull it up into the header and kill duplicate versions. Separately, both macros are identical: 35948b2bd3431aee7149e85cfe4becbc /tmp/a 35948b2bd3431aee7149e85cfe4becbc /tmp/b Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431538944-27724-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 25 +++++++++++++++++++++++++ arch/x86/lib/copy_user_64.S | 24 ------------------------ arch/x86/lib/copy_user_nocache_64.S | 24 ------------------------ 3 files changed, 25 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7730c1c5c83a..189679aba703 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -63,6 +63,31 @@ _ASM_ALIGN ; \ _ASM_PTR (entry); \ .popsection + +.macro ALIGN_DESTINATION + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE(100b,103b) + _ASM_EXTABLE(101b,103b) + .endm + #else # define _ASM_EXTABLE(from,to) \ " .pushsection \"__ex_table\",\"a\"\n" \ diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index fa997dfaef24..06ce685c3a5d 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,30 +16,6 @@ #include #include - .macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE(100b,103b) - _ASM_EXTABLE(101b,103b) - .endm - /* Standard copy_to_user with segment limit checking */ ENTRY(_copy_to_user) CFI_STARTPROC diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index 42eeb12e0cd9..b836a2bace15 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -14,30 +14,6 @@ #include #include - .macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE(100b,103b) - _ASM_EXTABLE(101b,103b) - .endm - /* * copy_user_nocache - Uncached memory copy with exception handling * This will force destination/source out of cache for more performance. -- cgit v1.2.1 From b41e6ec242cba0151f0b32041cfa728e7ca6e0b7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 13 May 2015 19:42:24 +0200 Subject: x86/asm/uaccess: Get rid of copy_user_nocache_64.S Move __copy_user_nocache() to arch/x86/lib/copy_user_64.S and kill the containing file. No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431538944-27724-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/lib/Makefile | 2 +- arch/x86/lib/copy_user_64.S | 92 +++++++++++++++++++++++++++++++ arch/x86/lib/copy_user_nocache_64.S | 107 ------------------------------------ 3 files changed, 93 insertions(+), 108 deletions(-) delete mode 100644 arch/x86/lib/copy_user_nocache_64.S (limited to 'arch/x86') diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 1530afb07c85..982989d282ff 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -40,6 +40,6 @@ else lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o - lib-y += copy_user_64.o copy_user_nocache_64.o + lib-y += copy_user_64.o lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 06ce685c3a5d..e4b3beee83bd 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -242,3 +242,95 @@ ENTRY(copy_user_enhanced_fast_string) _ASM_EXTABLE(1b,12b) CFI_ENDPROC ENDPROC(copy_user_enhanced_fast_string) + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * This will force destination/source out of cache for more performance. + */ +ENTRY(__copy_user_nocache) + CFI_STARTPROC + ASM_STAC + cmpl $8,%edx + jb 20f /* less then 8 bytes, go to byte copy loop */ + ALIGN_DESTINATION + movl %edx,%ecx + andl $63,%edx + shrl $6,%ecx + jz 17f +1: movq (%rsi),%r8 +2: movq 1*8(%rsi),%r9 +3: movq 2*8(%rsi),%r10 +4: movq 3*8(%rsi),%r11 +5: movnti %r8,(%rdi) +6: movnti %r9,1*8(%rdi) +7: movnti %r10,2*8(%rdi) +8: movnti %r11,3*8(%rdi) +9: movq 4*8(%rsi),%r8 +10: movq 5*8(%rsi),%r9 +11: movq 6*8(%rsi),%r10 +12: movq 7*8(%rsi),%r11 +13: movnti %r8,4*8(%rdi) +14: movnti %r9,5*8(%rdi) +15: movnti %r10,6*8(%rdi) +16: movnti %r11,7*8(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + decl %ecx + jnz 1b +17: movl %edx,%ecx + andl $7,%edx + shrl $3,%ecx + jz 20f +18: movq (%rsi),%r8 +19: movnti %r8,(%rdi) + leaq 8(%rsi),%rsi + leaq 8(%rdi),%rdi + decl %ecx + jnz 18b +20: andl %edx,%edx + jz 23f + movl %edx,%ecx +21: movb (%rsi),%al +22: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 21b +23: xorl %eax,%eax + ASM_CLAC + sfence + ret + + .section .fixup,"ax" +30: shll $6,%ecx + addl %ecx,%edx + jmp 60f +40: lea (%rdx,%rcx,8),%rdx + jmp 60f +50: movl %ecx,%edx +60: sfence + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE(1b,30b) + _ASM_EXTABLE(2b,30b) + _ASM_EXTABLE(3b,30b) + _ASM_EXTABLE(4b,30b) + _ASM_EXTABLE(5b,30b) + _ASM_EXTABLE(6b,30b) + _ASM_EXTABLE(7b,30b) + _ASM_EXTABLE(8b,30b) + _ASM_EXTABLE(9b,30b) + _ASM_EXTABLE(10b,30b) + _ASM_EXTABLE(11b,30b) + _ASM_EXTABLE(12b,30b) + _ASM_EXTABLE(13b,30b) + _ASM_EXTABLE(14b,30b) + _ASM_EXTABLE(15b,30b) + _ASM_EXTABLE(16b,30b) + _ASM_EXTABLE(18b,40b) + _ASM_EXTABLE(19b,40b) + _ASM_EXTABLE(21b,50b) + _ASM_EXTABLE(22b,50b) + CFI_ENDPROC +ENDPROC(__copy_user_nocache) diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S deleted file mode 100644 index b836a2bace15..000000000000 --- a/arch/x86/lib/copy_user_nocache_64.S +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright 2008 Vitaly Mayatskikh - * Copyright 2002 Andi Kleen, SuSE Labs. - * Subject to the GNU Public License v2. - * - * Functions to copy from and to user space. - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination/source out of cache for more performance. - */ -ENTRY(__copy_user_nocache) - CFI_STARTPROC - ASM_STAC - cmpl $8,%edx - jb 20f /* less then 8 bytes, go to byte copy loop */ - ALIGN_DESTINATION - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz 17f -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movnti %r8,(%rdi) -6: movnti %r9,1*8(%rdi) -7: movnti %r10,2*8(%rdi) -8: movnti %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movnti %r8,4*8(%rdi) -14: movnti %r9,5*8(%rdi) -15: movnti %r10,6*8(%rdi) -16: movnti %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz 1b -17: movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz 20f -18: movq (%rsi),%r8 -19: movnti %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz 18b -20: andl %edx,%edx - jz 23f - movl %edx,%ecx -21: movb (%rsi),%al -22: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 21b -23: xorl %eax,%eax - ASM_CLAC - sfence - ret - - .section .fixup,"ax" -30: shll $6,%ecx - addl %ecx,%edx - jmp 60f -40: lea (%rdx,%rcx,8),%rdx - jmp 60f -50: movl %ecx,%edx -60: sfence - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE(1b,30b) - _ASM_EXTABLE(2b,30b) - _ASM_EXTABLE(3b,30b) - _ASM_EXTABLE(4b,30b) - _ASM_EXTABLE(5b,30b) - _ASM_EXTABLE(6b,30b) - _ASM_EXTABLE(7b,30b) - _ASM_EXTABLE(8b,30b) - _ASM_EXTABLE(9b,30b) - _ASM_EXTABLE(10b,30b) - _ASM_EXTABLE(11b,30b) - _ASM_EXTABLE(12b,30b) - _ASM_EXTABLE(13b,30b) - _ASM_EXTABLE(14b,30b) - _ASM_EXTABLE(15b,30b) - _ASM_EXTABLE(16b,30b) - _ASM_EXTABLE(18b,40b) - _ASM_EXTABLE(19b,40b) - _ASM_EXTABLE(21b,50b) - _ASM_EXTABLE(22b,50b) - CFI_ENDPROC -ENDPROC(__copy_user_nocache) -- cgit v1.2.1 From be6cb02779ca74d83481f017db21578cfe92891c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 10 Apr 2015 14:08:46 +0200 Subject: x86: Align jump targets to 1-byte boundaries The following NOP in a hot function caught my attention: > 5a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) That's a dead NOP that bloats the function a bit, added for the default 16-byte alignment that GCC applies for jump targets. I realize that x86 CPU manufacturers recommend 16-byte jump target alignments (it's in the Intel optimization manual), to help their relatively narrow decoder prefetch alignment and uop cache constraints, but the cost of that is very significant: text data bss dec filename 12566391 1617840 1089536 15273767 vmlinux.align.16-byte 12224951 1617840 1089536 14932327 vmlinux.align.1-byte By using 1-byte jump target alignment (i.e. no alignment at all) we get an almost 3% reduction in kernel size (!) - and a probably similar reduction in I$ footprint. Now, the usual justification for jump target alignment is the following: - modern decoders tend to have 16-byte (effective) decoder prefetch windows. (AMD documents it higher but measurements suggest the effective prefetch window on curretn uarchs is still around 16 bytes) - on Intel there's also the uop-cache with cachelines that have 16-byte granularity and limited associativity. - older x86 uarchs had a penalty for decoder fetches that crossed 16-byte boundaries. These limits are mostly gone from recent uarchs. So if a forward jump target is aligned to cacheline boundary then prefetches will start from a new prefetch-cacheline and there's higher chance for decoding in fewer steps and packing tightly. But I think that argument is flawed for typical optimized kernel code flows: forward jumps often go to 'cold' (uncommon) pieces of code, and aligning cold code to cache lines does not bring a lot of advantages (they are uncommon), while it causes collateral damage: - their alignment 'spreads out' the cache footprint, it shifts followup hot code further out - plus it slows down even 'cold' code that immediately follows 'hot' code (like in the above case), which could have benefited from the partial cacheline that comes off the end of hot code. But even in the cache-hot case the 16 byte alignment brings disadvantages: - it spreads out the cache footprint, possibly making the code fall out of the L1 I$. - On Intel CPUs, recent microarchitectures have plenty of uop cache (typically doubling every 3 years) - while the size of the L1 cache grows much less aggressively. So workloads are rarely uop cache limited. The only situation where alignment might matter are tight loops that could fit into a single 16 byte chunk - but those are pretty rare in the kernel: if they exist they tend to be pointer chasing or generic memory ops, which both tend to be cache miss (or cache allocation) intensive and are not decoder bandwidth limited. So the balance of arguments strongly favors packing kernel instructions tightly versus maximizing for decoder bandwidth: this patch changes the jump target alignment from 16 bytes to 1 byte (tightly packed, unaligned). Acked-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20150410120846.GA17101@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c7c31876bb40..ca17e5fcd125 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -77,6 +77,9 @@ else KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 + # Align jump targets to 1 byte, not the default 16 bytes: + KBUILD_CFLAGS += -falign-jumps=1 + # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) -- cgit v1.2.1 From 6af7faf6076697a39438cf38e21b4035e2ebdac9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 May 2015 15:48:25 +0200 Subject: x86: Use entering[_ack]_irq() instead of open coding it Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 6 ++---- arch/x86/kernel/cpu/mshyperv.c | 6 ++---- arch/x86/kernel/irq.c | 16 ++++------------ 3 files changed, 8 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b590c9d6736a..28eba2d38b15 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -542,9 +542,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; - ack_APIC_irq(); - irq_enter(); - exit_idle(); + entering_ack_irq(); me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { @@ -596,7 +594,7 @@ unlock: raw_spin_unlock(&desc->lock); } - irq_exit(); + exiting_irq(); } static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 939155ffdece..aad4bd84b475 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -39,14 +39,12 @@ void hyperv_vector_handler(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - irq_enter(); - exit_idle(); - + entering_irq(); inc_irq_stat(irq_hv_callback_count); if (vmbus_handler) vmbus_handler(); - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index fe2ed8bb507b..be3894512820 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -198,8 +198,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; - irq_enter(); - exit_idle(); + entering_irq(); irq = __this_cpu_read(vector_irq[vector]); @@ -215,7 +214,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) } } - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); return 1; @@ -250,16 +249,9 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); - - irq_enter(); - - exit_idle(); - + entering_ack_irq(); inc_irq_stat(kvm_posted_intr_ipis); - - irq_exit(); - + exiting_irq(); set_irq_regs(old_regs); } #endif -- cgit v1.2.1 From 6dc178760553605c58d78bd403dfcb4e042c5b72 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 May 2015 15:50:45 +0200 Subject: x86: Consolidate irq entering inlines smp.c and irq_work.c implement the same inline helper. Move it to apic.h and use it everywhere. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra --- arch/x86/include/asm/apic.h | 6 ++++++ arch/x86/kernel/irq_work.c | 10 ++-------- arch/x86/kernel/smp.c | 19 ++++++------------- 3 files changed, 14 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 976b86a325e5..c8393634ca0c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -644,6 +644,12 @@ static inline void entering_ack_irq(void) entering_irq(); } +static inline void ipi_entering_ack_irq(void) +{ + ack_APIC_irq(); + irq_enter(); +} + static inline void exiting_irq(void) { irq_exit(); diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 15d741ddfeeb..dc5fa6a1e8d6 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -10,12 +10,6 @@ #include #include -static inline void irq_work_entering_irq(void) -{ - irq_enter(); - ack_APIC_irq(); -} - static inline void __smp_irq_work_interrupt(void) { inc_irq_stat(apic_irq_work_irqs); @@ -24,14 +18,14 @@ static inline void __smp_irq_work_interrupt(void) __visible void smp_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); __smp_irq_work_interrupt(); trace_irq_work_exit(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index be8e1bde07aa..15aaa69bbb5e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -170,8 +170,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { - ack_APIC_irq(); - irq_enter(); + ipi_entering_ack_irq(); stop_this_cpu(NULL); irq_exit(); } @@ -265,12 +264,6 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs) */ } -static inline void smp_entering_irq(void) -{ - ack_APIC_irq(); - irq_enter(); -} - __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* @@ -279,7 +272,7 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) * scheduler_ipi(). This is OK, since those functions are allowed * to nest. */ - smp_entering_irq(); + ipi_entering_ack_irq(); trace_reschedule_entry(RESCHEDULE_VECTOR); __smp_reschedule_interrupt(); trace_reschedule_exit(RESCHEDULE_VECTOR); @@ -297,14 +290,14 @@ static inline void __smp_call_function_interrupt(void) __visible void smp_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); __smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); @@ -319,14 +312,14 @@ static inline void __smp_call_function_single_interrupt(void) __visible void smp_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); __smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); -- cgit v1.2.1 From 52648e83c9a6b9f7fc3dd272d4d10175e93aa62a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2015 07:56:54 +0200 Subject: x86: Pack loops tightly as well Packing loops tightly (-falign-loops=1) is beneficial to code size: text data bss dec filename 12566391 1617840 1089536 15273767 vmlinux.align.16-byte 12224951 1617840 1089536 14932327 vmlinux.align.1-byte 11976567 1617840 1089536 14683943 vmlinux.align.1-byte.funcs-1-byte 11903735 1617840 1089536 14611111 vmlinux.align.1-byte.funcs-1-byte.loops-1-byte Which reduces the size of the kernel by another 0.6%, so the the total combined size reduction of the alignment-packing patches is ~5.5%. The x86 decoder bandwidth and caching arguments laid out in: be6cb02779ca ("x86: Align jump targets to 1-byte boundaries") apply to loop alignment as well. Furtermore, modern CPU uarchs have a loop cache/buffer that is a L0 cache before even any uop cache, covering a few dozen most recently executed instructions. This loop cache generally does not have the 16-byte alignment restrictions of the uop cache. Now loop alignment can still be beneficial if: - a loop is cache-hot and its surroundings are not. - if the loop is so cache hot that the instruction flow becomes x86 decoder bandwidth limited But loop alignment is harmful if: - a loop is cache-cold - a loop's surroundings are cache-hot as well - two cache-hot loops are close to each other - if the loop fits into the loop cache - if the code flow is not decoder bandwidth limited and I'd argue that the latter five scenarios are much more common in the kernel, as our hottest loops are typically: - pointer chasing: this should fit into the loop cache in most cases and is typically data cache and address generation limited - generic memory ops (memset, memcpy, etc.): these generally fit into the loop cache as well, and are likewise data cache limited. So this patch packs loop addresses tightly as well. Acked-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20150410123017.GB19918@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index ca17e5fcd125..57996ee840dd 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -80,6 +80,9 @@ else # Align jump targets to 1 byte, not the default 16 bytes: KBUILD_CFLAGS += -falign-jumps=1 + # Pack loops tightly as well: + KBUILD_CFLAGS += -falign-loops=1 + # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) -- cgit v1.2.1 From e839004b49c571e20006092cbe9da8f2c95d2e71 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 16 May 2015 18:17:59 +0200 Subject: x86/asm/head*.S: Change global labels to local Make the disassembly look less confusing: -- head_64.o.before.asm ++ head_64.o.after.asm 0000000000000120 : 120: fc cld 121: 83 3c 24 02 cmpl $0x2,(%rsp) - 125: 0f 84 9d 00 00 00 je 1c8 + 125: 0f 84 9d 00 00 00 je 1c8 12b: 83 3d 00 00 00 00 02 cmpl $0x2,0x0(%rip) # 132 132: 74 7e je 1b2 134: ff 05 00 00 00 00 incl 0x0(%rip) # 13a @@ -1198,9 +1198,7 @@ Disassembly of section .init.text: 1bf: 5a pop %rdx 1c0: 59 pop %rcx 1c1: 58 pop %rax - 1c2: ff 0d 00 00 00 00 decl 0x0(%rip) # 1c8 - -00000000000001c8 : + 1c2: ff 0d 00 00 00 00 decl 0x0(%rip) # 1c8 1c8: 48 83 c4 10 add $0x10,%rsp 1cc: 48 cf iretq -- head_32.o.before.asm ++ head_32.o.after.asm 0000016c : 16c: fc cld 16d: 83 3c 24 02 cmpl $0x2,(%esp) - 171: 74 73 je 1e6 + 171: 74 73 je 1e6 173: 36 83 3d 00 00 00 00 cmpl $0x2,%ss:0x0 17a: 02 17b: 74 5a je 1d7 @@ -483,8 +483,6 @@ Disassembly of section .init.text: 1dd: 59 pop %ecx 1de: 58 pop %eax 1df: 36 ff 0d 00 00 00 00 decl %ss:0x0 - -000001e6 : 1e6: 83 c4 08 add $0x8,%esp 1e9: cf iret 1ea: 66 90 xchg %ax,%ax No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431793079-11153-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 4 ++-- arch/x86/kernel/head_64.S | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d031bad9e07e..02d257256200 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -547,7 +547,7 @@ ENTRY(early_idt_handler) cld cmpl $2,(%esp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,%ss:early_recursion_flag je hlt_loop @@ -600,7 +600,7 @@ ex_entry: pop %ecx pop %eax decl %ss:early_recursion_flag -is_nmi: +.Lis_nmi: addl $8,%esp /* drop vector number and error code */ iret ENDPROC(early_idt_handler) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ae6588b301c2..43eafc8afb69 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -344,7 +344,7 @@ ENTRY(early_idt_handler) cld cmpl $2,(%rsp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,early_recursion_flag(%rip) jz 1f @@ -409,7 +409,7 @@ ENTRY(early_idt_handler) popq %rcx popq %rax decl early_recursion_flag(%rip) -is_nmi: +.Lis_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN ENDPROC(early_idt_handler) -- cgit v1.2.1 From adeb5537849d9db428fe0ddc3562e5a765a347e2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 15 May 2015 22:39:06 +0200 Subject: x86/asm/entry/64: Use shorter MOVs from segment registers The "movw %ds,%cx" instruction needs a 0x66 prefix, while "movl %ds,%ecx" does not. The difference is that latter form (on 64-bit CPUs) overwrites the entire %ecx, not only its lower half. But subsequent code doesn't depend on the value of upper half of %ecx, so we can safely use the shorter instruction. The new code is also faster than the old one - now we don't depend on the old value of %ecx, but this code fragment is not performance-critical so it does not matter much. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1431722346-26585-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 09c3f9e0e07e..47b95813dc37 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1190,17 +1190,17 @@ ENTRY(xen_failsafe_callback) /*CFI_REL_OFFSET ds,DS*/ CFI_REL_OFFSET r11,8 CFI_REL_OFFSET rcx,0 - movw %ds,%cx + movl %ds,%ecx cmpw %cx,0x10(%rsp) CFI_REMEMBER_STATE jne 1f - movw %es,%cx + movl %es,%ecx cmpw %cx,0x18(%rsp) jne 1f - movw %fs,%cx + movl %fs,%ecx cmpw %cx,0x20(%rsp) jne 1f - movw %gs,%cx + movl %gs,%ecx cmpw %cx,0x28(%rsp) jne 1f /* All segments match their saved values => Category 2 (Bad IRET). */ -- cgit v1.2.1 From 8de3eafc161022dd094fa009346509c712e9c4b0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 17 May 2015 12:54:58 +0200 Subject: x86/microcode/intel: Rename get_matching_microcode ... to has_newer_microcode() as it does exactly that: checks whether binary data @mc has newer microcode patch than the applied one. Move @mc to be the first function arg too. Signed-off-by: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431860101-14847-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode_intel.h | 2 +- arch/x86/kernel/cpu/microcode/intel.c | 4 ++-- arch/x86/kernel/cpu/microcode/intel_early.c | 8 ++++---- arch/x86/kernel/cpu/microcode/intel_lib.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 3564299863f0..8e87e6fe98b5 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -56,7 +56,7 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc); +extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); extern int microcode_sanity_check(void *mc, int print_err); extern int get_matching_sig(unsigned int csig, int cpf, void *mc); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index e20d4e58cd89..969dc17eb1b4 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -63,7 +63,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) cpf = cpu_sig.pf; crev = cpu_sig.rev; - return get_matching_microcode(csig, cpf, crev, mc_intel); + return has_newer_microcode(mc_intel, csig, cpf, crev); } static int apply_microcode_intel(int cpu) @@ -165,7 +165,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, csig = uci->cpu_sig.sig; cpf = uci->cpu_sig.pf; - if (get_matching_microcode(csig, cpf, new_rev, mc)) { + if (has_newer_microcode(mc, csig, cpf, new_rev)) { vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index ccd474a7a59e..5f828268357d 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -59,10 +59,10 @@ load_microcode_early(struct microcode_intel **saved, ucode_ptr = saved[i]; mc_hdr = (struct microcode_header_intel *)ucode_ptr; - ret = get_matching_microcode(uci->cpu_sig.sig, - uci->cpu_sig.pf, - new_rev, - ucode_ptr); + ret = has_newer_microcode(ucode_ptr, + uci->cpu_sig.sig, + uci->cpu_sig.pf, + new_rev); if (!ret) continue; diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 7de293726923..425f8e29b795 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -154,7 +154,7 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc) /* * Returns 1 if update has been found, 0 otherwise. */ -int get_matching_microcode(unsigned int csig, int cpf, int new_rev, void *mc) +int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) { struct microcode_header_intel *mc_hdr = mc; @@ -163,4 +163,4 @@ int get_matching_microcode(unsigned int csig, int cpf, int new_rev, void *mc) return get_matching_sig(csig, cpf, mc); } -EXPORT_SYMBOL_GPL(get_matching_microcode); +EXPORT_SYMBOL_GPL(has_newer_microcode); -- cgit v1.2.1 From 6b2d469f5b5dd1d39548f2e79557b9b5ffe00eb1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 17 May 2015 12:54:59 +0200 Subject: x86/microcode/intel: Simplify update_match_cpu() Drop unreadable macro, deconstruct compound conditional statement into single ones and return early if they match. Add comments. There should be no functionality change resulting from this patch. Signed-off-by: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431860101-14847-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode_intel.h | 3 --- arch/x86/kernel/cpu/microcode/intel_lib.c | 19 +++++++++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 8e87e6fe98b5..45a318f677be 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -51,9 +51,6 @@ struct extended_sigtable { (((struct microcode_intel *)mc)->hdr.datasize ? \ ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) -#define sigmatch(s1, s2, p1, p2) \ - (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) - #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 425f8e29b795..1ffe507931af 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -31,11 +31,18 @@ #include #include -static inline int -update_match_cpu(unsigned int csig, unsigned int cpf, - unsigned int sig, unsigned int pf) +static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) { - return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; + if (s1 != s2) + return false; + + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; + + /* ... or they intersect. */ + return p1 & p2; } int microcode_sanity_check(void *mc, int print_err) @@ -132,7 +139,7 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc) int ext_sigcount, i; struct extended_signature *ext_sig; - if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf)) + if (cpu_signatures_match(csig, cpf, mc_header->sig, mc_header->pf)) return 1; /* Look for ext. headers: */ @@ -144,7 +151,7 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc) ext_sig = (void *)ext_header + EXT_HEADER_SIZE; for (i = 0; i < ext_sigcount; i++) { - if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf)) + if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) return 1; ext_sig++; } -- cgit v1.2.1 From 9e5aed83bbd95ca2dee732f56a7a278350cef807 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 17 May 2015 12:55:00 +0200 Subject: x86/microcode/intel: Simplify get_matching_sig() Unclutter function, make it a bit more readable, drop local variables. No functionality change. Signed-off-by: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431860101-14847-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel_lib.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 1ffe507931af..def9c935f3f9 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -133,24 +133,22 @@ EXPORT_SYMBOL_GPL(microcode_sanity_check); */ int get_matching_sig(unsigned int csig, int cpf, void *mc) { - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - int ext_sigcount, i; + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; struct extended_signature *ext_sig; + int i; - if (cpu_signatures_match(csig, cpf, mc_header->sig, mc_header->pf)) + if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) return 1; /* Look for ext. headers: */ - if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) return 0; - ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - for (i = 0; i < ext_sigcount; i++) { + for (i = 0; i < ext_hdr->count; i++) { if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) return 1; ext_sig++; -- cgit v1.2.1 From e774eaa9f6069b70b5208aa50539a09f41cf7e73 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 17 May 2015 12:55:01 +0200 Subject: x86/microcode/intel: Rename get_matching_sig() ... to find_matching_signature() which is exactly what it does. No functionality change. Signed-off-by: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431860101-14847-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode_intel.h | 2 +- arch/x86/kernel/cpu/microcode/intel_early.c | 2 +- arch/x86/kernel/cpu/microcode/intel_lib.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 45a318f677be..7991c606125d 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -55,7 +55,7 @@ struct extended_sigtable { extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); extern int microcode_sanity_check(void *mc, int print_err); -extern int get_matching_sig(unsigned int csig, int cpf, void *mc); +extern int find_matching_signature(void *mc, unsigned int csig, int cpf); #ifdef CONFIG_MICROCODE_INTEL_EARLY extern void __init load_ucode_intel_bsp(void); diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 5f828268357d..10dff3f3f686 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -256,7 +256,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, sig = mc_saved_hdr->sig; pf = mc_saved_hdr->pf; - if (!get_matching_sig(sig, pf, ucode_ptr)) + if (!find_matching_signature(ucode_ptr, sig, pf)) continue; found = 1; diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index def9c935f3f9..1883d252ff7d 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(microcode_sanity_check); /* * Returns 1 if update has been found, 0 otherwise. */ -int get_matching_sig(unsigned int csig, int cpf, void *mc) +int find_matching_signature(void *mc, unsigned int csig, int cpf) { struct microcode_header_intel *mc_hdr = mc; struct extended_sigtable *ext_hdr; @@ -166,6 +166,6 @@ int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) if (mc_hdr->rev <= new_rev) return 0; - return get_matching_sig(csig, cpf, mc); + return find_matching_signature(mc, csig, cpf); } EXPORT_SYMBOL_GPL(has_newer_microcode); -- cgit v1.2.1 From 7cb685982157567dcc55eb92d1c38d237465203b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 May 2015 12:05:13 +0200 Subject: x86/smp/boot: Fix legacy SMP bootup slow-boot bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So while testing kernels using tools/kvm/ (kvmtool) I noticed that it booted super slow: [ 0.142991] Performance Events: no PMU driver, software events only. [ 0.149265] x86: Booting SMP configuration: [ 0.149765] .... node #0, CPUs: #1 [ 0.148304] kvm-clock: cpu 1, msr 2:1bfe9041, secondary cpu clock [ 10.158813] KVM setup async PF for cpu 1 [ 10.159000] #2 [ 10.159000] kvm-stealtime: cpu 1, msr 211a4d400 [ 10.158829] kvm-clock: cpu 2, msr 2:1bfe9081, secondary cpu clock [ 20.167805] KVM setup async PF for cpu 2 [ 20.168000] #3 [ 20.168000] kvm-stealtime: cpu 2, msr 211a8d400 [ 20.167818] kvm-clock: cpu 3, msr 2:1bfe90c1, secondary cpu clock [ 30.176902] KVM setup async PF for cpu 3 [ 30.177000] #4 [ 30.177000] kvm-stealtime: cpu 3, msr 211acd400 One CPU booted up per 10 seconds. With 120 CPUs that takes a while. Bisection pinpointed this commit: 853b160aaafb ("Revert f5d6a52f5111 ("x86/smpboot: Skip delays during SMP initialization similar to Xen")") But that commit just restores previous behavior, so it cannot cause the problem. After some head scratching it turns out that these two commits: 1a744cb356c5 ("x86/smp/boot: Remove 10ms delay from cpu_up() on modern processors") d68921f9bd14 ("x86/smp/boot: Add cmdline "cpu_init_udelay=N" to specify cpu_up() delay") added the following code to smpboot.c: - mdelay(10); + mdelay(init_udelay); Note the mismatch in the units: the delay is called 'udelay' and is set to microseconds - while the function used here is actually 'mdelay', which counts in milliseconds ... So the delay for legacy systems is off by a factor of 1,000, so instead of 10 msecs we waited for 10 seconds ... The reason bisection pointed to 853b160aaafb was that 853b160aaafb removed a (broken) boot-time speedup patch, which masked the factor 1,000 bug. Fix it by using udelay(). This fixes my bootup problems. Cc: Len Brown Cc: Alan Cox Cc: Arjan van de Ven Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jan H. Schönherr Cc: Linus Torvalds Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b9aaa3930b2f..fd6291c921b6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -617,7 +617,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); - mdelay(init_udelay); + udelay(init_udelay); pr_debug("Deasserting INIT\n"); -- cgit v1.2.1 From ab3f02fc237211f0583c1e7ba3bf504747be9b8d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 May 2015 10:52:27 +0200 Subject: locking/arch: Add WRITE_ONCE() to set_mb() Since we assume set_mb() to result in a single store followed by a full memory barrier, employ WRITE_ONCE(). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/barrier.h | 2 +- arch/x86/um/asm/barrier.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 959e45b81fe2..9de5cde133a1 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -40,7 +40,7 @@ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) +#define set_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #endif /* SMP */ #define read_barrier_depends() do { } while (0) diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 7e8a1a650435..cc0cb01f346d 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -39,7 +39,7 @@ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) +#define set_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #define read_barrier_depends() do { } while (0) #define smp_read_barrier_depends() do { } while (0) -- cgit v1.2.1 From b92b8b35a2e38bde319fd1d68ec84628c1f1b0fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 May 2015 10:51:55 +0200 Subject: locking/arch: Rename set_mb() to smp_store_mb() Since set_mb() is really about an smp_mb() -- not a IO/DMA barrier like mb() rename it to match the recent smp_load_acquire() and smp_store_release(). Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/barrier.h | 4 ++-- arch/x86/um/asm/barrier.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 9de5cde133a1..e51a8f803f55 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -35,12 +35,12 @@ #define smp_mb() mb() #define smp_rmb() dma_rmb() #define smp_wmb() barrier() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #endif /* SMP */ #define read_barrier_depends() do { } while (0) diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index cc0cb01f346d..b9531d343134 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -39,7 +39,8 @@ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) + +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #define read_barrier_depends() do { } while (0) #define smp_read_barrier_depends() do { } while (0) -- cgit v1.2.1 From b3c395ef5556a6c60f4426cc060f5b7bdcf82d5b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:08 +0200 Subject: mm/uaccess, mm/fault: Clarify that uaccess may only sleep if pagefaults are enabled In general, non-atomic variants of user access functions must not sleep if pagefaults are disabled. Let's update all relevant comments in uaccess code. This also reflects the might_sleep() checks in might_fault(). Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-4-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 15 ++++++++++----- arch/x86/include/asm/uaccess_32.h | 6 ++++-- arch/x86/lib/usercopy_32.c | 6 ++++-- 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ace9dec050b1..a8df874f3e88 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -74,7 +74,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * @addr: User space pointer to start of block to check * @size: Size of block to check * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Checks if a pointer to a block of memory in user space is valid. * @@ -145,7 +146,8 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -240,7 +242,8 @@ extern void __put_user_8(void); * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -455,7 +458,8 @@ struct __large_struct { unsigned long buf[100]; }; * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -479,7 +483,8 @@ struct __large_struct { unsigned long buf[100]; }; * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 3c03a5de64d3..7c8ad3451988 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -70,7 +70,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) * @from: Source address, in kernel space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. @@ -117,7 +118,8 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to kernel space. Caller must check * the specified block with access_ok() before calling this function. diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index e2f5e21c03b3..91d93b95bd86 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -647,7 +647,8 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); * @from: Source address, in kernel space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from kernel space to user space. * @@ -668,7 +669,8 @@ EXPORT_SYMBOL(_copy_to_user); * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to kernel space. * -- cgit v1.2.1 From 2cb7c9cb426660b5ed58b643d9e7dd5d50ba901f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:09 +0200 Subject: sched/preempt, mm/kmap: Explicitly disable/enable preemption in kmap_atomic_* The existing code relies on pagefault_disable() implicitly disabling preemption, so that no schedule will happen between kmap_atomic() and kunmap_atomic(). Let's make this explicit, to prepare for pagefault_disable() not touching preemption anymore. Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-5-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 3 ++- arch/x86/mm/iomap_32.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 4500142bc4aa..eecb207a2037 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -35,7 +35,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) unsigned long vaddr; int idx, type; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) @@ -100,6 +100,7 @@ void __kunmap_atomic(void *kvaddr) #endif pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 9ca35fc60cfe..2b7ece0e103a 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -59,6 +59,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) unsigned long vaddr; int idx, type; + preempt_disable(); pagefault_disable(); type = kmap_atomic_idx_push(); @@ -117,5 +118,6 @@ iounmap_atomic(void __iomem *kvaddr) } pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL_GPL(iounmap_atomic); -- cgit v1.2.1 From 70ffdb9393a7264a069265edded729078dcf0425 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:11 +0200 Subject: mm/fault, arch: Use pagefault_disable() to check for disabled pagefaults in the handler Introduce faulthandler_disabled() and use it to check for irq context and disabled pagefaults (via pagefault_disable()) in the pagefault handlers. Please note that we keep the in_atomic() checks in place - to detect whether in irq context (in which case preemption is always properly disabled). In contrast, preempt_disable() should never be used to disable pagefaults. With !CONFIG_PREEMPT_COUNT, preempt_disable() doesn't modify the preempt counter, and therefore the result of in_atomic() differs. We validate that condition by using might_fault() checks when calling might_sleep(). Therefore, add a comment to faulthandler_disabled(), describing why this is needed. faulthandler_disabled() and pagefault_disable() are defined in linux/uaccess.h, so let's properly add that include to all relevant files. This patch is based on a patch from Thomas Gleixner. Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-7-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 181c53bac3a7..9dc909841739 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -13,6 +13,7 @@ #include /* hstate_index_to_shift */ #include /* prefetchw */ #include /* exception_enter(), ... */ +#include /* faulthandler_disabled() */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ @@ -1126,9 +1127,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, /* * If we're in an interrupt, have no user context or are running - * in an atomic region then we must not take the fault: + * in a region with pagefaults disabled then we must not take the fault */ - if (unlikely(in_atomic() || !mm)) { + if (unlikely(faulthandler_disabled() || !mm)) { bad_area_nosemaphore(regs, error_code, address); return; } -- cgit v1.2.1 From ea6cd25058f39ac69623efdcbd94a7fc7d4d13f0 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 9 May 2015 20:27:37 -0400 Subject: x86: Rename eisa_set_level_irq to elcr_set_level_irq This routine has been around for over a decade, but with EISA being dead and abandoned for about twice that long, the name can be kind of confusing. The function is going at the PIC Edge/Level Configuration Registers (ELCR), so rename it as such and mentally decouple it from the long since dead EISA bus. Signed-off-by: Paul Gortmaker Reviewed-by: Maciej W. Rozycki Acked-by: Pavel Machek Cc: Rafael J. Wysocki Cc: Len Brown Cc: Bjorn Helgaas Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1431217657-934-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 3 +-- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/pci/irq.c | 13 +++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9472c9aff26d..9ec5d37d8da3 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -157,8 +157,7 @@ static inline void unlock_vector_lock(void) {} extern atomic_t irq_err_count; extern atomic_t irq_mis_count; -/* EISA */ -extern void eisa_set_level_irq(unsigned int irq); +extern void elcr_set_level_irq(unsigned int irq); /* SMP */ extern __visible void smp_apic_timer_interrupt(struct pt_regs *); diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 271293ad89d7..e49ee24da85e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -608,7 +608,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi, * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (trigger == ACPI_LEVEL_SENSITIVE) - eisa_set_level_irq(gsi); + elcr_set_level_irq(gsi); #endif return gsi; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 5dc6ca5e1741..9bd115484745 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -146,19 +146,20 @@ static void __init pirq_peer_trick(void) /* * Code for querying and setting of IRQ routes on various interrupt routers. + * PIC Edge/Level Control Registers (ELCR) 0x4d0 & 0x4d1. */ -void eisa_set_level_irq(unsigned int irq) +void elcr_set_level_irq(unsigned int irq) { unsigned char mask = 1 << (irq & 7); unsigned int port = 0x4d0 + (irq >> 3); unsigned char val; - static u16 eisa_irq_mask; + static u16 elcr_irq_mask; - if (irq >= 16 || (1 << irq) & eisa_irq_mask) + if (irq >= 16 || (1 << irq) & elcr_irq_mask) return; - eisa_irq_mask |= (1 << irq); + elcr_irq_mask |= (1 << irq); printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq); val = inb(port); if (!(val & mask)) { @@ -965,11 +966,11 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { msg = "found"; - eisa_set_level_irq(irq); + elcr_set_level_irq(irq); } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { if (r->set(pirq_router_dev, dev, pirq, newirq)) { - eisa_set_level_irq(newirq); + elcr_set_level_irq(newirq); msg = "assigned"; irq = newirq; } -- cgit v1.2.1 From 0a7815515471a80379bfefc9f1913e0d8c87fbfb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 10:58:52 +0200 Subject: x86/fpu: Rename unlazy_fpu() to fpu__save() This function is a misnomer on two levels: 1) it doesn't really manipulate TS on modern CPUs anymore, its primary purpose is to save FPU state, used: - when executing fork()/clone(): to copy current FPU state to the child's FPU state. - when handling math exceptions: to generate the math error si_code in the signal frame. 2) even on legacy CPUs it doesn't actually 'unlazy', if then it lazies the FPU state: as a side effect of the old FNSAVE instruction which clears (destroys) FPU state it's necessary to set CR0::TS. So rename it to fpu__save() to better reflect its purpose. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 2 +- arch/x86/include/asm/i387.h | 2 +- arch/x86/kernel/i387.c | 6 +++--- arch/x86/kernel/traps.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index da5e96756570..49db0440b0ed 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -602,7 +602,7 @@ static inline void fpu_copy(struct task_struct *dst, struct task_struct *src) struct fpu *dfpu = &dst->thread.fpu; struct fpu *sfpu = &src->thread.fpu; - unlazy_fpu(src); + fpu__save(src); memcpy(dfpu->state, sfpu->state, xstate_size); } } diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 6eb6fcb83f63..d4419da9b210 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -101,7 +101,7 @@ static inline int user_has_fpu(void) return current->thread.fpu.has_fpu; } -extern void unlazy_fpu(struct task_struct *tsk); +extern void fpu__save(struct task_struct *tsk); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 009183276bb7..ec1a744eb853 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -117,7 +117,7 @@ void __kernel_fpu_end(void) } EXPORT_SYMBOL(__kernel_fpu_end); -void unlazy_fpu(struct task_struct *tsk) +void fpu__save(struct task_struct *tsk) { preempt_disable(); if (__thread_has_fpu(tsk)) { @@ -130,7 +130,7 @@ void unlazy_fpu(struct task_struct *tsk) } preempt_enable(); } -EXPORT_SYMBOL(unlazy_fpu); +EXPORT_SYMBOL(fpu__save); unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; @@ -251,7 +251,7 @@ int init_fpu(struct task_struct *tsk) if (tsk_used_math(tsk)) { if (cpu_has_fpu && tsk == current) - unlazy_fpu(tsk); + fpu__save(tsk); task_disable_lazy_fpu_restore(tsk); return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 324ab5247687..12f29f9907cd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -731,7 +731,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) /* * Save the info for the exception handler and clear the error. */ - unlazy_fpu(task); + fpu__save(task); task->thread.trap_nr = trapnr; task->thread.error_code = error_code; info.si_signo = SIGFPE; -- cgit v1.2.1 From 4af08f2f47f0ed3c48e557ac47ed2ae616d6866b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 11:01:36 +0200 Subject: x86/fpu: Add comments to fpu__save() and restrict its export Add an explanation to fpu__save() and also don't export it to random modules - we don't want them to futz around with deep kernel internals. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index ec1a744eb853..ac47278cde71 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -117,6 +117,9 @@ void __kernel_fpu_end(void) } EXPORT_SYMBOL(__kernel_fpu_end); +/* + * Save the FPU state (initialize it if necessary): + */ void fpu__save(struct task_struct *tsk) { preempt_disable(); @@ -130,7 +133,7 @@ void fpu__save(struct task_struct *tsk) } preempt_enable(); } -EXPORT_SYMBOL(fpu__save); +EXPORT_SYMBOL_GPL(fpu__save); unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; -- cgit v1.2.1 From 87cdb98affe54b6f390f4622b254569b90820817 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 11:06:43 +0200 Subject: x86/fpu: Add debugging check to fpu__save() Document the function a bit more and add debugging check that we are only running this with the current task. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index ac47278cde71..66f1053ae2cd 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -119,9 +119,13 @@ EXPORT_SYMBOL(__kernel_fpu_end); /* * Save the FPU state (initialize it if necessary): + * + * This only ever gets called for the current task. */ void fpu__save(struct task_struct *tsk) { + WARN_ON(tsk != current); + preempt_disable(); if (__thread_has_fpu(tsk)) { if (use_eager_fpu()) { -- cgit v1.2.1 From 1a7dc0db7181ebc8b9ec17e5a15ad4c766c7d3d4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 11:33:52 +0200 Subject: x86/fpu: Rename fpu_detect() to fpu__detect() Use the fpu__*() namespace to organize FPU ops better. Also document fpu__detect() a bit. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/i387.c | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 23ba6765b718..2dc08c231a9a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -166,7 +166,7 @@ extern const struct seq_operations cpuinfo_op; #define cache_line_size() (boot_cpu_data.x86_cache_alignment) extern void cpu_detect(struct cpuinfo_x86 *c); -extern void fpu_detect(struct cpuinfo_x86 *c); +extern void fpu__detect(struct cpuinfo_x86 *c); extern void early_cpu_init(void); extern void identify_boot_cpu(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a62cf04dac8a..60a29d290e2a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -759,7 +759,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); - fpu_detect(c); + fpu__detect(c); if (this_cpu->c_early_init) this_cpu->c_early_init(c); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 66f1053ae2cd..29251f5668b1 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -640,7 +640,11 @@ static int __init no_387(char *s) __setup("no387", no_387); -void fpu_detect(struct cpuinfo_x86 *c) +/* + * Set the X86_FEATURE_FPU CPU-capability bit based on + * trying to execute an actual sequence of FPU instructions: + */ +void fpu__detect(struct cpuinfo_x86 *c) { unsigned long cr0; u16 fsw, fcw; -- cgit v1.2.1 From 68ad8b9feae583cc5102ec86ad410fce542a5311 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 11:50:20 +0200 Subject: x86/fpu: Remove stale init_fpu() prototype We are going to split init_fpu() so keep only a single prototype, in i387.h. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/xsave.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index c9a6d68b8d62..58ed0ca5a11e 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -51,7 +51,6 @@ extern struct xsave_struct *init_xstate_buf; extern void xsave_init(void); extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); -extern int init_fpu(struct task_struct *child); /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -- cgit v1.2.1 From 97185c95f7ab7f752473c34672dab0925758094b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 12:02:02 +0200 Subject: x86/fpu: Split an fpstate_alloc_init() function out of init_fpu() Most init_fpu() users don't want the register-saving aspect of the function, they are calling it for 'current' and when FPU registers are not allocated and initialized yet. Split out a simplified API that does just that (and add debug-checks for these conditions): fpstate_alloc_init(). Use it where appropriate. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 3 +++ arch/x86/kernel/i387.c | 31 +++++++++++++++++++++++++++++++ arch/x86/kernel/process.c | 2 +- arch/x86/kernel/traps.c | 2 +- arch/x86/kernel/xsave.c | 2 +- arch/x86/kvm/x86.c | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- 7 files changed, 39 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index d4419da9b210..1a896b4533c4 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -18,7 +18,10 @@ struct pt_regs; struct user_i387_struct; +extern int fpstate_alloc_init(struct task_struct *curr); + extern int init_fpu(struct task_struct *child); + extern void fpu_finit(struct fpu *fpu); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern void math_state_restore(void); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 29251f5668b1..56b6e726fb60 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -246,6 +246,37 @@ void fpu_finit(struct fpu *fpu) } EXPORT_SYMBOL_GPL(fpu_finit); +/* + * Allocate the backing store for the current task's FPU registers + * and initialize the registers themselves as well. + * + * Can fail. + */ +int fpstate_alloc_init(struct task_struct *curr) +{ + int ret; + + if (WARN_ON_ONCE(curr != current)) + return -EINVAL; + if (WARN_ON_ONCE(curr->flags & PF_USED_MATH)) + return -EINVAL; + + /* + * Memory allocation at the first usage of the FPU and other state. + */ + ret = fpu_alloc(&curr->thread.fpu); + if (ret) + return ret; + + fpu_finit(&curr->thread.fpu); + + /* Safe to do for the current task: */ + curr->flags |= PF_USED_MATH; + + return 0; +} +EXPORT_SYMBOL_GPL(fpstate_alloc_init); + /* * The _current_ task is using the FPU for the first time * so initialize it and set the mxcsr to its default diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6e338e3b1dc0..abdb81d07423 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -159,7 +159,7 @@ void flush_thread(void) } else { if (!tsk_used_math(tsk)) { /* kthread execs. TODO: cleanup this horror. */ - if (WARN_ON(init_fpu(tsk))) + if (WARN_ON(fpstate_alloc_init(tsk))) force_sig(SIGKILL, tsk); user_fpu_begin(); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 12f29f9907cd..cf9c9627be19 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -846,7 +846,7 @@ void math_state_restore(void) /* * does a slab alloc which can sleep */ - if (init_fpu(tsk)) { + if (fpstate_alloc_init(tsk)) { /* * ran out of memory! */ diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 87a815b85f3e..a977cdd03825 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -349,7 +349,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_READ, buf, size)) return -EACCES; - if (!used_math() && init_fpu(tsk)) + if (!used_math() && fpstate_alloc_init(tsk)) return -1; if (!static_cpu_has(X86_FEATURE_FPU)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c73efcd03e29..bfc396632ee8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6600,7 +6600,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - if (!tsk_used_math(current) && init_fpu(current)) + if (!tsk_used_math(current) && fpstate_alloc_init(current)) return -ENOMEM; if (vcpu->sigset_active) diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 9b868124128d..c9ff09a02385 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -149,7 +149,7 @@ void math_emulate(struct math_emu_info *info) struct desc_struct code_descriptor; if (!used_math()) { - if (init_fpu(current)) { + if (fpstate_alloc_init(current)) { do_group_exit(SIGKILL); return; } -- cgit v1.2.1 From bda283796b38baae3ec5c8c788b143b1fb9dcc77 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 12:17:28 +0200 Subject: x86/fpu: Make init_fpu() static Now that the allocation users have been split off into a separate function, init_fpu() has become local to i387.c: make it static. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 2 -- arch/x86/kernel/i387.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 1a896b4533c4..0367d17371f5 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -20,8 +20,6 @@ struct user_i387_struct; extern int fpstate_alloc_init(struct task_struct *curr); -extern int init_fpu(struct task_struct *child); - extern void fpu_finit(struct fpu *fpu); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern void math_state_restore(void); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 56b6e726fb60..95079026c386 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -283,7 +283,7 @@ EXPORT_SYMBOL_GPL(fpstate_alloc_init); * value at reset if we support XMM instructions and then * remember the current task has used the FPU. */ -int init_fpu(struct task_struct *tsk) +static int init_fpu(struct task_struct *tsk) { int ret; @@ -306,7 +306,6 @@ int init_fpu(struct task_struct *tsk) set_stopped_child_used_math(tsk); return 0; } -EXPORT_SYMBOL_GPL(init_fpu); /* * The xstateregs_active() routine is the same as the fpregs_active() routine, -- cgit v1.2.1 From 67e97fc2ec575adaacca296834f9ea0e1e34b563 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 12:21:03 +0200 Subject: x86/fpu: Rename init_fpu() to fpu__unlazy_stopped() and add debugging check This function name is a misnomer now that we've split out all the other users from it. Rename it accordingly: it's used to save the FPU state of (ptrace-)stopped child tasks. Add debugging check to double check this intended usage: that this function is only called for non-current, stopped child tasks. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 95079026c386..909d10d2fb70 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -283,27 +283,30 @@ EXPORT_SYMBOL_GPL(fpstate_alloc_init); * value at reset if we support XMM instructions and then * remember the current task has used the FPU. */ -static int init_fpu(struct task_struct *tsk) +static int fpu__unlazy_stopped(struct task_struct *child) { int ret; - if (tsk_used_math(tsk)) { - if (cpu_has_fpu && tsk == current) - fpu__save(tsk); - task_disable_lazy_fpu_restore(tsk); + if (WARN_ON_ONCE(child == current)) + return -EINVAL; + + if (tsk_used_math(child)) { + if (cpu_has_fpu && child == current) + fpu__save(child); + task_disable_lazy_fpu_restore(child); return 0; } /* * Memory allocation at the first usage of the FPU and other state. */ - ret = fpu_alloc(&tsk->thread.fpu); + ret = fpu_alloc(&child->thread.fpu); if (ret) return ret; - fpu_finit(&tsk->thread.fpu); + fpu_finit(&child->thread.fpu); - set_stopped_child_used_math(tsk); + set_stopped_child_used_math(child); return 0; } @@ -331,7 +334,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - ret = init_fpu(target); + ret = fpu__unlazy_stopped(target); if (ret) return ret; @@ -350,7 +353,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - ret = init_fpu(target); + ret = fpu__unlazy_stopped(target); if (ret) return ret; @@ -384,7 +387,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_xsave) return -ENODEV; - ret = init_fpu(target); + ret = fpu__unlazy_stopped(target); if (ret) return ret; @@ -414,7 +417,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_xsave) return -ENODEV; - ret = init_fpu(target); + ret = fpu__unlazy_stopped(target); if (ret) return ret; @@ -577,7 +580,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - ret = init_fpu(target); + ret = fpu__unlazy_stopped(target); if (ret) return ret; @@ -608,7 +611,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - ret = init_fpu(target); + ret = fpu__unlazy_stopped(target); if (ret) return ret; -- cgit v1.2.1 From 8694c3e793810de3f7ff612d0fc23b70dc5a7e4d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 12:28:15 +0200 Subject: x86/fpu: Optimize fpu__unlazy_stopped() This function is only called for stopped child tasks, so the fpu__save() branch will never get called - remove it. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 909d10d2fb70..76006a701dbb 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -291,8 +291,6 @@ static int fpu__unlazy_stopped(struct task_struct *child) return -EINVAL; if (tsk_used_math(child)) { - if (cpu_has_fpu && child == current) - fpu__save(child); task_disable_lazy_fpu_restore(child); return 0; } -- cgit v1.2.1 From 071ae621ec51153d740c9a2252d2d46e03e80d58 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 12:29:47 +0200 Subject: x86/fpu: Simplify fpu__unlazy_stopped() Open code the PF_USED_MATH logic, to make the logic more obvious. (We'll slowly convert the other users of *_used_math() methods as well.) Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 76006a701dbb..5e4dae70ffa5 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -290,7 +290,7 @@ static int fpu__unlazy_stopped(struct task_struct *child) if (WARN_ON_ONCE(child == current)) return -EINVAL; - if (tsk_used_math(child)) { + if (child->flags & PF_USED_MATH) { task_disable_lazy_fpu_restore(child); return 0; } @@ -304,7 +304,9 @@ static int fpu__unlazy_stopped(struct task_struct *child) fpu_finit(&child->thread.fpu); - set_stopped_child_used_math(child); + /* Safe to do for stopped child tasks: */ + child->flags |= PF_USED_MATH; + return 0; } -- cgit v1.2.1 From 373244221ade7c4d8d4fa970355ca3803711fbff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 12:34:26 +0200 Subject: x86/fpu: Remove fpu_allocated() It's an unnecessary obfuscation of a very simple allocation pattern. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 49db0440b0ed..21000f0f0ae1 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -569,14 +569,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) } } -static bool fpu_allocated(struct fpu *fpu) -{ - return fpu->state != NULL; -} - static inline int fpu_alloc(struct fpu *fpu) { - if (fpu_allocated(fpu)) + if (fpu->state) return 0; fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); if (!fpu->state) -- cgit v1.2.1 From 6fbe67124869be10e8b0ceedc4ee7c5691d78c40 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 12:37:30 +0200 Subject: x86/fpu: Move fpu_alloc() out of line This is not a small function, and it's used in several places, one of them a popular module (KVM). Move the function out of line. This saves a bit of text, even with the symbol export overhead: text data bss dec hex filename 12566052 1619504 1089536 15275092 e91454 vmlinux.before 12566046 1619504 1089536 15275086 e9144e vmlinux.after Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 11 +---------- arch/x86/kernel/i387.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 21000f0f0ae1..bdbba1a4de69 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -569,16 +569,7 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) } } -static inline int fpu_alloc(struct fpu *fpu) -{ - if (fpu->state) - return 0; - fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); - if (!fpu->state) - return -ENOMEM; - WARN_ON((unsigned long)fpu->state & 15); - return 0; -} +extern int fpu_alloc(struct fpu *fpu); static inline void fpu_free(struct fpu *fpu) { diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 5e4dae70ffa5..05fcc90087b0 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -246,6 +246,18 @@ void fpu_finit(struct fpu *fpu) } EXPORT_SYMBOL_GPL(fpu_finit); +int fpu_alloc(struct fpu *fpu) +{ + if (fpu->state) + return 0; + fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); + if (!fpu->state) + return -ENOMEM; + WARN_ON((unsigned long)fpu->state & 15); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_alloc); + /* * Allocate the backing store for the current task's FPU registers * and initialize the registers themselves as well. -- cgit v1.2.1 From ed97b085461ca6bdc22162b68b4cba69459ce1c8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 12:41:14 +0200 Subject: x86/fpu: Rename fpu_alloc() to fpstate_alloc() Use the fpu__*() namespace for fpstate_alloc() as well. Also add a comment about FPU state alignment. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 2 +- arch/x86/kernel/i387.c | 12 ++++++++---- arch/x86/kernel/process.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index bdbba1a4de69..88f584ab3463 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -569,7 +569,7 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) } } -extern int fpu_alloc(struct fpu *fpu); +extern int fpstate_alloc(struct fpu *fpu); static inline void fpu_free(struct fpu *fpu) { diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 05fcc90087b0..5f2feb63b72a 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -246,17 +246,21 @@ void fpu_finit(struct fpu *fpu) } EXPORT_SYMBOL_GPL(fpu_finit); -int fpu_alloc(struct fpu *fpu) +int fpstate_alloc(struct fpu *fpu) { if (fpu->state) return 0; + fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); if (!fpu->state) return -ENOMEM; + + /* The CPU requires the FPU state to be aligned to 16 byte boundaries: */ WARN_ON((unsigned long)fpu->state & 15); + return 0; } -EXPORT_SYMBOL_GPL(fpu_alloc); +EXPORT_SYMBOL_GPL(fpstate_alloc); /* * Allocate the backing store for the current task's FPU registers @@ -276,7 +280,7 @@ int fpstate_alloc_init(struct task_struct *curr) /* * Memory allocation at the first usage of the FPU and other state. */ - ret = fpu_alloc(&curr->thread.fpu); + ret = fpstate_alloc(&curr->thread.fpu); if (ret) return ret; @@ -310,7 +314,7 @@ static int fpu__unlazy_stopped(struct task_struct *child) /* * Memory allocation at the first usage of the FPU and other state. */ - ret = fpu_alloc(&child->thread.fpu); + ret = fpstate_alloc(&child->thread.fpu); if (ret) return ret; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index abdb81d07423..a0ed7c06a12c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -92,7 +92,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) dst->thread.fpu.state = NULL; task_disable_lazy_fpu_restore(dst); if (tsk_used_math(src)) { - int err = fpu_alloc(&dst->thread.fpu); + int err = fpstate_alloc(&dst->thread.fpu); if (err) return err; fpu_copy(dst, src); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bfc396632ee8..28fa733bb1c6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7007,7 +7007,7 @@ int fx_init(struct kvm_vcpu *vcpu) { int err; - err = fpu_alloc(&vcpu->arch.guest_fpu); + err = fpstate_alloc(&vcpu->arch.guest_fpu); if (err) return err; -- cgit v1.2.1 From a7c2a83364c0d882a2d850c4c183c6de42d5c02b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 12:55:22 +0200 Subject: x86/fpu: Rename fpu_free() to fpstate_free() Use the fpu__*() namespace. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 2 +- arch/x86/kernel/process.c | 2 +- arch/x86/kvm/x86.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 88f584ab3463..b68e8f04c38a 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -571,7 +571,7 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) extern int fpstate_alloc(struct fpu *fpu); -static inline void fpu_free(struct fpu *fpu) +static inline void fpstate_free(struct fpu *fpu) { if (fpu->state) { kmem_cache_free(task_xstate_cachep, fpu->state); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index a0ed7c06a12c..fd4aa56335de 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -102,7 +102,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) void free_thread_xstate(struct task_struct *tsk) { - fpu_free(&tsk->thread.fpu); + fpstate_free(&tsk->thread.fpu); } void arch_release_task_struct(struct task_struct *tsk) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 28fa733bb1c6..80a411c83083 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7029,7 +7029,7 @@ EXPORT_SYMBOL_GPL(fx_init); static void fx_free(struct kvm_vcpu *vcpu) { - fpu_free(&vcpu->arch.guest_fpu); + fpstate_free(&vcpu->arch.guest_fpu); } void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) -- cgit v1.2.1 From c0ee2cf61be0bd3db2a30d76056a2e09fa48272e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 13:01:52 +0200 Subject: x86/fpu: Rename fpu_finit() to fpstate_init() Make it clear that we are initializing the in-memory FPU context area, no the FPU registers. Also move it to the fpu__*() namespace. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 2 +- arch/x86/kernel/i387.c | 8 ++++---- arch/x86/kernel/xsave.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 0367d17371f5..6552a16e0e38 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -19,8 +19,8 @@ struct pt_regs; struct user_i387_struct; extern int fpstate_alloc_init(struct task_struct *curr); +extern void fpstate_init(struct fpu *fpu); -extern void fpu_finit(struct fpu *fpu); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern void math_state_restore(void); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 5f2feb63b72a..e0c16e86deb0 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -225,7 +225,7 @@ void fpu_init(void) eager_fpu_init(); } -void fpu_finit(struct fpu *fpu) +void fpstate_init(struct fpu *fpu) { if (!cpu_has_fpu) { finit_soft_fpu(&fpu->state->soft); @@ -244,7 +244,7 @@ void fpu_finit(struct fpu *fpu) fp->fos = 0xffff0000u; } } -EXPORT_SYMBOL_GPL(fpu_finit); +EXPORT_SYMBOL_GPL(fpstate_init); int fpstate_alloc(struct fpu *fpu) { @@ -284,7 +284,7 @@ int fpstate_alloc_init(struct task_struct *curr) if (ret) return ret; - fpu_finit(&curr->thread.fpu); + fpstate_init(&curr->thread.fpu); /* Safe to do for the current task: */ curr->flags |= PF_USED_MATH; @@ -318,7 +318,7 @@ static int fpu__unlazy_stopped(struct task_struct *child) if (ret) return ret; - fpu_finit(&child->thread.fpu); + fpstate_init(&child->thread.fpu); /* Safe to do for stopped child tasks: */ child->flags |= PF_USED_MATH; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index a977cdd03825..163b5cc582ef 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -395,7 +395,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) || __copy_from_user(&env, buf, sizeof(env))) { - fpu_finit(fpu); + fpstate_init(fpu); err = -1; } else { sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 80a411c83083..26b1f89fc608 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7011,7 +7011,7 @@ int fx_init(struct kvm_vcpu *vcpu) if (err) return err; - fpu_finit(&vcpu->arch.guest_fpu); + fpstate_init(&vcpu->arch.guest_fpu); if (cpu_has_xsaves) vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; -- cgit v1.2.1 From 3a9c4b0d7e1a693beeac24d749f6938444148e86 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 13:16:51 +0200 Subject: x86/fpu: Rename fpu_init() to fpu__cpu_init() fpu_init() is a bit of a misnomer in that it (falsely) creates the impression that it's related to the (old) fpu_finit() function, which initializes FPU ctx state. Rename it to fpu__cpu_init() to make its boot time initialization clear, and to move it to the fpu__*() namespace. Also fix and extend its comment block to point out that it's called not only on the boot CPU, but on secondary CPUs as well. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 2 +- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/i387.c | 10 ++++++---- arch/x86/xen/enlighten.c | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index b68e8f04c38a..02e0e97d8be7 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -39,7 +39,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, #endif extern unsigned int mxcsr_feature_mask; -extern void fpu_init(void); +extern void fpu__cpu_init(void); extern void eager_fpu_init(void); DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 60a29d290e2a..b8035b8dd186 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1439,7 +1439,7 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu_init(); + fpu__cpu_init(); if (is_uv_system()) uv_cpu_init(); @@ -1495,7 +1495,7 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu_init(); + fpu__cpu_init(); } #endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index e0c16e86deb0..5b4672584e65 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -183,11 +183,13 @@ static void init_thread_xstate(void) } /* - * Called at bootup to set up the initial FPU state that is later cloned - * into all processes. + * Called on the boot CPU at bootup to set up the initial FPU state that + * is later cloned into all processes. + * + * Also called on secondary CPUs to set up the FPU state of their + * idle threads. */ - -void fpu_init(void) +void fpu__cpu_init(void) { unsigned long cr0; unsigned long cr4_mask = 0; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 46957ead3060..43f8704b7289 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1423,7 +1423,7 @@ static void xen_pvh_set_cr_flags(int cpu) return; /* * For BSP, PSE PGE are set in probe_page_size_mask(), for APs - * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init. + * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__cpu_init(). */ if (cpu_has_pse) cr4_set_bits_and_update_boot(X86_CR4_PSE); -- cgit v1.2.1 From 3f6a0bce90289e0980b4250ccb03b765860247ee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Apr 2015 13:22:42 +0200 Subject: x86/fpu: Rename init_thread_xstate() to fpstate_xstate_init_size() So init_thread_xstate() is a misnomer in that it's not really related to a specific thread - it determines, once during initial bootup, the size of the xstate context. Also improve the comments. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 5b4672584e65..01101553c6c1 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -158,7 +158,7 @@ static void mxcsr_feature_mask_init(void) mxcsr_feature_mask &= mask; } -static void init_thread_xstate(void) +static void fpstate_xstate_init_size(void) { /* * Note that xstate_size might be overwriten later during @@ -216,11 +216,11 @@ void fpu__cpu_init(void) write_cr0(cr0); /* - * init_thread_xstate is only called once to avoid overriding - * xstate_size during boot time or during CPU hotplug. + * fpstate_xstate_init_size() is only called once, to avoid overriding + * 'xstate_size' during (secondary CPU) bootup or during CPU hotplug. */ if (xstate_size == 0) - init_thread_xstate(); + fpstate_xstate_init_size(); mxcsr_feature_mask_init(); xsave_init(); -- cgit v1.2.1 From c0c2803dee21bef08ef5aacdf96fe2f1759ccc62 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 09:52:56 +0200 Subject: x86/fpu: Move thread_info::fpu_counter into thread_info::fpu.counter This field is kept separate from the main FPU state structure for no good reason. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 10 +++++----- arch/x86/include/asm/processor.h | 18 +++++++++--------- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/traps.c | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 02e0e97d8be7..f85d21b68901 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -384,7 +384,7 @@ static inline void drop_fpu(struct task_struct *tsk) * Forget coprocessor state.. */ preempt_disable(); - tsk->thread.fpu_counter = 0; + tsk->thread.fpu.counter = 0; if (__thread_has_fpu(tsk)) { /* Ignore delayed exceptions from user space */ @@ -441,7 +441,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta * or if the past 5 consecutive context-switches used math. */ fpu.preload = tsk_used_math(new) && - (use_eager_fpu() || new->thread.fpu_counter > 5); + (use_eager_fpu() || new->thread.fpu.counter > 5); if (__thread_has_fpu(old)) { if (!__save_init_fpu(old)) @@ -454,16 +454,16 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new->thread.fpu_counter++; + new->thread.fpu.counter++; __thread_set_has_fpu(new); prefetch(new->thread.fpu.state); } else if (!use_eager_fpu()) stts(); } else { - old->thread.fpu_counter = 0; + old->thread.fpu.counter = 0; task_disable_lazy_fpu_restore(old); if (fpu.preload) { - new->thread.fpu_counter++; + new->thread.fpu.counter++; if (fpu_lazy_restore(new, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2dc08c231a9a..64d6b5d97ce9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -433,6 +433,15 @@ struct fpu { unsigned int last_cpu; unsigned int has_fpu; union thread_xstate *state; + /* + * This counter contains the number of consecutive context switches + * that the FPU is used. If this is over a threshold, the lazy fpu + * saving becomes unlazy to save the trap. This is an unsigned char + * so that after 256 times the counter wraps and the behavior turns + * lazy again; this to deal with bursty apps that only use FPU for + * a short time + */ + unsigned char counter; }; #ifdef CONFIG_X86_64 @@ -535,15 +544,6 @@ struct thread_struct { unsigned long iopl; /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; - /* - * fpu_counter contains the number of consecutive context switches - * that the FPU is used. If this is over a threshold, the lazy fpu - * saving becomes unlazy to save the trap. This is an unsigned char - * so that after 256 times the counter wraps and the behavior turns - * lazy again; this to deal with bursty apps that only use FPU for - * a short time - */ - unsigned char fpu_counter; }; /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index fd4aa56335de..c7793addc237 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -87,7 +87,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; - dst->thread.fpu_counter = 0; + dst->thread.fpu.counter = 0; dst->thread.fpu.has_fpu = 0; dst->thread.fpu.state = NULL; task_disable_lazy_fpu_restore(dst); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index cf9c9627be19..231aa579d9cd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -863,7 +863,7 @@ void math_state_restore(void) fpu_reset_state(tsk); force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); } else { - tsk->thread.fpu_counter++; + tsk->thread.fpu.counter++; } kernel_fpu_enable(); } -- cgit v1.2.1 From 126009993faa7a750835e67f3ccb90cee124ffa7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 09:57:24 +0200 Subject: x86/fpu: Improve the comment for the fpu::counter field This was pretty hard to read, improve it. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 64d6b5d97ce9..28df85561730 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -435,11 +435,11 @@ struct fpu { union thread_xstate *state; /* * This counter contains the number of consecutive context switches - * that the FPU is used. If this is over a threshold, the lazy fpu - * saving becomes unlazy to save the trap. This is an unsigned char - * so that after 256 times the counter wraps and the behavior turns - * lazy again; this to deal with bursty apps that only use FPU for - * a short time + * during which the FPU stays used. If this is over a threshold, the + * lazy fpu saving logic becomes unlazy, to save the trap overhead. + * This is an unsigned char so that after 256 iterations the counter + * wraps and the context switch behavior turns lazy again; this is to + * deal with bursty apps that only use the FPU for a short time: */ unsigned char counter; }; -- cgit v1.2.1 From 14b9675ae9c83c764c0c1fdf4b33f0e9156a4e4f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 09:57:24 +0200 Subject: x86/fpu: Move FPU data structures to asm/fpu_types.h Move the FPU details to asm/fpu_types.h, to further factor out the FPU code. ( As an added bonus, the 'struct orig_ist' definition now moves next to its other data types - the FPU definitions were slapped in the middle of them for some mysterious reason. ) No code changed. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 132 +++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 132 +-------------------------------------- 2 files changed, 133 insertions(+), 131 deletions(-) create mode 100644 arch/x86/include/asm/fpu/types.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h new file mode 100644 index 000000000000..e996023380d3 --- /dev/null +++ b/arch/x86/include/asm/fpu/types.h @@ -0,0 +1,132 @@ + +#define MXCSR_DEFAULT 0x1f80 + +struct i387_fsave_struct { + u32 cwd; /* FPU Control Word */ + u32 swd; /* FPU Status Word */ + u32 twd; /* FPU Tag Word */ + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Pointer Offset */ + u32 fos; /* FPU Operand Pointer Selector */ + + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + + /* Software status information [not touched by FSAVE ]: */ + u32 status; +}; + +struct i387_fxsave_struct { + u16 cwd; /* Control Word */ + u16 swd; /* Status Word */ + u16 twd; /* Tag Word */ + u16 fop; /* Last Instruction Opcode */ + union { + struct { + u64 rip; /* Instruction Pointer */ + u64 rdp; /* Data Pointer */ + }; + struct { + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Offset */ + u32 fos; /* FPU Operand Selector */ + }; + }; + u32 mxcsr; /* MXCSR Register State */ + u32 mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes: */ + u32 st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes: */ + u32 xmm_space[64]; + + u32 padding[12]; + + union { + u32 padding1[12]; + u32 sw_reserved[12]; + }; + +} __attribute__((aligned(16))); + +struct i387_soft_struct { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + u8 ftop; + u8 changed; + u8 lookahead; + u8 no_update; + u8 rm; + u8 alimit; + struct math_emu_info *info; + u32 entry_eip; +}; + +struct ymmh_struct { + /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ + u32 ymmh_space[64]; +}; + +/* We don't support LWP yet: */ +struct lwp_struct { + u8 reserved[128]; +}; + +struct bndreg { + u64 lower_bound; + u64 upper_bound; +} __packed; + +struct bndcsr { + u64 bndcfgu; + u64 bndstatus; +} __packed; + +struct xsave_hdr_struct { + u64 xstate_bv; + u64 xcomp_bv; + u64 reserved[6]; +} __attribute__((packed)); + +struct xsave_struct { + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; + struct ymmh_struct ymmh; + struct lwp_struct lwp; + struct bndreg bndreg[4]; + struct bndcsr bndcsr; + /* new processor state extensions will go here */ +} __attribute__ ((packed, aligned (64))); + +union thread_xstate { + struct i387_fsave_struct fsave; + struct i387_fxsave_struct fxsave; + struct i387_soft_struct soft; + struct xsave_struct xsave; +}; + +struct fpu { + unsigned int last_cpu; + unsigned int has_fpu; + union thread_xstate *state; + /* + * This counter contains the number of consecutive context switches + * during which the FPU stays used. If this is over a threshold, the + * lazy fpu saving logic becomes unlazy, to save the trap overhead. + * This is an unsigned char so that after 256 iterations the counter + * wraps and the context switch behavior turns lazy again; this is to + * deal with bursty apps that only use the FPU for a short time: + */ + unsigned char counter; +}; + diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 28df85561730..6b75c4b927ec 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -21,6 +21,7 @@ struct mm_struct; #include #include #include +#include #include #include @@ -313,137 +314,6 @@ struct orig_ist { unsigned long ist[7]; }; -#define MXCSR_DEFAULT 0x1f80 - -struct i387_fsave_struct { - u32 cwd; /* FPU Control Word */ - u32 swd; /* FPU Status Word */ - u32 twd; /* FPU Tag Word */ - u32 fip; /* FPU IP Offset */ - u32 fcs; /* FPU IP Selector */ - u32 foo; /* FPU Operand Pointer Offset */ - u32 fos; /* FPU Operand Pointer Selector */ - - /* 8*10 bytes for each FP-reg = 80 bytes: */ - u32 st_space[20]; - - /* Software status information [not touched by FSAVE ]: */ - u32 status; -}; - -struct i387_fxsave_struct { - u16 cwd; /* Control Word */ - u16 swd; /* Status Word */ - u16 twd; /* Tag Word */ - u16 fop; /* Last Instruction Opcode */ - union { - struct { - u64 rip; /* Instruction Pointer */ - u64 rdp; /* Data Pointer */ - }; - struct { - u32 fip; /* FPU IP Offset */ - u32 fcs; /* FPU IP Selector */ - u32 foo; /* FPU Operand Offset */ - u32 fos; /* FPU Operand Selector */ - }; - }; - u32 mxcsr; /* MXCSR Register State */ - u32 mxcsr_mask; /* MXCSR Mask */ - - /* 8*16 bytes for each FP-reg = 128 bytes: */ - u32 st_space[32]; - - /* 16*16 bytes for each XMM-reg = 256 bytes: */ - u32 xmm_space[64]; - - u32 padding[12]; - - union { - u32 padding1[12]; - u32 sw_reserved[12]; - }; - -} __attribute__((aligned(16))); - -struct i387_soft_struct { - u32 cwd; - u32 swd; - u32 twd; - u32 fip; - u32 fcs; - u32 foo; - u32 fos; - /* 8*10 bytes for each FP-reg = 80 bytes: */ - u32 st_space[20]; - u8 ftop; - u8 changed; - u8 lookahead; - u8 no_update; - u8 rm; - u8 alimit; - struct math_emu_info *info; - u32 entry_eip; -}; - -struct ymmh_struct { - /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ - u32 ymmh_space[64]; -}; - -/* We don't support LWP yet: */ -struct lwp_struct { - u8 reserved[128]; -}; - -struct bndreg { - u64 lower_bound; - u64 upper_bound; -} __packed; - -struct bndcsr { - u64 bndcfgu; - u64 bndstatus; -} __packed; - -struct xsave_hdr_struct { - u64 xstate_bv; - u64 xcomp_bv; - u64 reserved[6]; -} __attribute__((packed)); - -struct xsave_struct { - struct i387_fxsave_struct i387; - struct xsave_hdr_struct xsave_hdr; - struct ymmh_struct ymmh; - struct lwp_struct lwp; - struct bndreg bndreg[4]; - struct bndcsr bndcsr; - /* new processor state extensions will go here */ -} __attribute__ ((packed, aligned (64))); - -union thread_xstate { - struct i387_fsave_struct fsave; - struct i387_fxsave_struct fxsave; - struct i387_soft_struct soft; - struct xsave_struct xsave; -}; - -struct fpu { - unsigned int last_cpu; - unsigned int has_fpu; - union thread_xstate *state; - /* - * This counter contains the number of consecutive context switches - * during which the FPU stays used. If this is over a threshold, the - * lazy fpu saving logic becomes unlazy, to save the trap overhead. - * This is an unsigned char so that after 256 iterations the counter - * wraps and the context switch behavior turns lazy again; this is to - * deal with bursty apps that only use the FPU for a short time: - */ - unsigned char counter; -}; - #ifdef CONFIG_X86_64 DECLARE_PER_CPU(struct orig_ist, orig_ist); -- cgit v1.2.1 From 47bc5106342b0c98ce5cc851173c201554d256d3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 10:17:06 +0200 Subject: x86/fpu: Clean up asm/fpu/types.h - add header guards - standardize vertical alignment - add comments about MPX No code changed. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 50 ++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index e996023380d3..efb520dcf38e 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -1,3 +1,8 @@ +/* + * FPU data structures: + */ +#ifndef _ASM_X86_FPU_H +#define _ASM_X86_FPU_H #define MXCSR_DEFAULT 0x1f80 @@ -52,6 +57,9 @@ struct i387_fxsave_struct { } __attribute__((aligned(16))); +/* + * Software based FPU emulation state: + */ struct i387_soft_struct { u32 cwd; u32 swd; @@ -74,38 +82,39 @@ struct i387_soft_struct { struct ymmh_struct { /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ - u32 ymmh_space[64]; + u32 ymmh_space[64]; }; /* We don't support LWP yet: */ struct lwp_struct { - u8 reserved[128]; + u8 reserved[128]; }; +/* Intel MPX support: */ struct bndreg { - u64 lower_bound; - u64 upper_bound; + u64 lower_bound; + u64 upper_bound; } __packed; struct bndcsr { - u64 bndcfgu; - u64 bndstatus; + u64 bndcfgu; + u64 bndstatus; } __packed; struct xsave_hdr_struct { - u64 xstate_bv; - u64 xcomp_bv; - u64 reserved[6]; + u64 xstate_bv; + u64 xcomp_bv; + u64 reserved[6]; } __attribute__((packed)); struct xsave_struct { - struct i387_fxsave_struct i387; - struct xsave_hdr_struct xsave_hdr; - struct ymmh_struct ymmh; - struct lwp_struct lwp; - struct bndreg bndreg[4]; - struct bndcsr bndcsr; - /* new processor state extensions will go here */ + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; + struct ymmh_struct ymmh; + struct lwp_struct lwp; + struct bndreg bndreg[4]; + struct bndcsr bndcsr; + /* New processor state extensions will go here. */ } __attribute__ ((packed, aligned (64))); union thread_xstate { @@ -116,9 +125,9 @@ union thread_xstate { }; struct fpu { - unsigned int last_cpu; - unsigned int has_fpu; - union thread_xstate *state; + unsigned int last_cpu; + unsigned int has_fpu; + union thread_xstate *state; /* * This counter contains the number of consecutive context switches * during which the FPU stays used. If this is over a threshold, the @@ -127,6 +136,7 @@ struct fpu { * wraps and the context switch behavior turns lazy again; this is to * deal with bursty apps that only use the FPU for a short time: */ - unsigned char counter; + unsigned char counter; }; +#endif /* _ASM_X86_FPU_H */ -- cgit v1.2.1 From ce4c4c26241f9ab08f14b028d40736f319ed2445 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 10:39:11 +0200 Subject: x86/fpu: Move i387.c and xsave.c to arch/x86/kernel/fpu/ Create a new subdirectory for the FPU support code in arch/x86/kernel/fpu/. Rename 'i387.c' to 'core.c' - as this really collects the core FPU support code, nothing i387 specific. We'll better organize this directory in later patches. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/fpu/Makefile | 5 + arch/x86/kernel/fpu/core.c | 718 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/xsave.c | 724 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/i387.c | 718 ------------------------------------------ arch/x86/kernel/xsave.c | 724 ------------------------------------------- 6 files changed, 1448 insertions(+), 1443 deletions(-) create mode 100644 arch/x86/kernel/fpu/Makefile create mode 100644 arch/x86/kernel/fpu/core.c create mode 100644 arch/x86/kernel/fpu/xsave.c delete mode 100644 arch/x86/kernel/i387.c delete mode 100644 arch/x86/kernel/xsave.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9bcd0b56ca17..febaf180621b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,7 +44,7 @@ obj-y += pci-iommu_table.o obj-y += resource.o obj-y += process.o -obj-y += i387.o xsave.o +obj-y += fpu/ obj-y += ptrace.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile new file mode 100644 index 000000000000..89fd66a4b3a1 --- /dev/null +++ b/arch/x86/kernel/fpu/Makefile @@ -0,0 +1,5 @@ +# +# Build rules for the FPU support code: +# + +obj-y += core.o xsave.o diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c new file mode 100644 index 000000000000..01101553c6c1 --- /dev/null +++ b/arch/x86/kernel/fpu/core.c @@ -0,0 +1,718 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(bool, in_kernel_fpu); + +void kernel_fpu_disable(void) +{ + WARN_ON(this_cpu_read(in_kernel_fpu)); + this_cpu_write(in_kernel_fpu, true); +} + +void kernel_fpu_enable(void) +{ + this_cpu_write(in_kernel_fpu, false); +} + +/* + * Were we in an interrupt that interrupted kernel mode? + * + * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: the thread must not have fpu (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + * + * Except for the eagerfpu case when we return true; in the likely case + * the thread has FPU but we are not going to set/clear TS. + */ +static inline bool interrupted_kernel_fpu_idle(void) +{ + if (this_cpu_read(in_kernel_fpu)) + return false; + + if (use_eager_fpu()) + return true; + + return !__thread_has_fpu(current) && + (read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. + */ +static inline bool interrupted_user_mode(void) +{ + struct pt_regs *regs = get_irq_regs(); + return regs && user_mode(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. + */ +bool irq_fpu_usable(void) +{ + return !in_interrupt() || + interrupted_user_mode() || + interrupted_kernel_fpu_idle(); +} +EXPORT_SYMBOL(irq_fpu_usable); + +void __kernel_fpu_begin(void) +{ + struct task_struct *me = current; + + this_cpu_write(in_kernel_fpu, true); + + if (__thread_has_fpu(me)) { + __save_init_fpu(me); + } else { + this_cpu_write(fpu_owner_task, NULL); + if (!use_eager_fpu()) + clts(); + } +} +EXPORT_SYMBOL(__kernel_fpu_begin); + +void __kernel_fpu_end(void) +{ + struct task_struct *me = current; + + if (__thread_has_fpu(me)) { + if (WARN_ON(restore_fpu_checking(me))) + fpu_reset_state(me); + } else if (!use_eager_fpu()) { + stts(); + } + + this_cpu_write(in_kernel_fpu, false); +} +EXPORT_SYMBOL(__kernel_fpu_end); + +/* + * Save the FPU state (initialize it if necessary): + * + * This only ever gets called for the current task. + */ +void fpu__save(struct task_struct *tsk) +{ + WARN_ON(tsk != current); + + preempt_disable(); + if (__thread_has_fpu(tsk)) { + if (use_eager_fpu()) { + __save_fpu(tsk); + } else { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(fpu__save); + +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int xstate_size; +EXPORT_SYMBOL_GPL(xstate_size); +static struct i387_fxsave_struct fx_scratch; + +static void mxcsr_feature_mask_init(void) +{ + unsigned long mask = 0; + + if (cpu_has_fxsr) { + memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : "+m" (fx_scratch)); + mask = fx_scratch.mxcsr_mask; + if (mask == 0) + mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; +} + +static void fpstate_xstate_init_size(void) +{ + /* + * Note that xstate_size might be overwriten later during + * xsave_init(). + */ + + if (!cpu_has_fpu) { + /* + * Disable xsave as we do not support it if i387 + * emulation is enabled. + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + xstate_size = sizeof(struct i387_soft_struct); + return; + } + + if (cpu_has_fxsr) + xstate_size = sizeof(struct i387_fxsave_struct); + else + xstate_size = sizeof(struct i387_fsave_struct); +} + +/* + * Called on the boot CPU at bootup to set up the initial FPU state that + * is later cloned into all processes. + * + * Also called on secondary CPUs to set up the FPU state of their + * idle threads. + */ +void fpu__cpu_init(void) +{ + unsigned long cr0; + unsigned long cr4_mask = 0; + +#ifndef CONFIG_MATH_EMULATION + if (!cpu_has_fpu) { + pr_emerg("No FPU found and no math emulation present\n"); + pr_emerg("Giving up\n"); + for (;;) + asm volatile("hlt"); + } +#endif + if (cpu_has_fxsr) + cr4_mask |= X86_CR4_OSFXSR; + if (cpu_has_xmm) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + cr4_set_bits(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!cpu_has_fpu) + cr0 |= X86_CR0_EM; + write_cr0(cr0); + + /* + * fpstate_xstate_init_size() is only called once, to avoid overriding + * 'xstate_size' during (secondary CPU) bootup or during CPU hotplug. + */ + if (xstate_size == 0) + fpstate_xstate_init_size(); + + mxcsr_feature_mask_init(); + xsave_init(); + eager_fpu_init(); +} + +void fpstate_init(struct fpu *fpu) +{ + if (!cpu_has_fpu) { + finit_soft_fpu(&fpu->state->soft); + return; + } + + memset(fpu->state, 0, xstate_size); + + if (cpu_has_fxsr) { + fx_finit(&fpu->state->fxsave); + } else { + struct i387_fsave_struct *fp = &fpu->state->fsave; + fp->cwd = 0xffff037fu; + fp->swd = 0xffff0000u; + fp->twd = 0xffffffffu; + fp->fos = 0xffff0000u; + } +} +EXPORT_SYMBOL_GPL(fpstate_init); + +int fpstate_alloc(struct fpu *fpu) +{ + if (fpu->state) + return 0; + + fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); + if (!fpu->state) + return -ENOMEM; + + /* The CPU requires the FPU state to be aligned to 16 byte boundaries: */ + WARN_ON((unsigned long)fpu->state & 15); + + return 0; +} +EXPORT_SYMBOL_GPL(fpstate_alloc); + +/* + * Allocate the backing store for the current task's FPU registers + * and initialize the registers themselves as well. + * + * Can fail. + */ +int fpstate_alloc_init(struct task_struct *curr) +{ + int ret; + + if (WARN_ON_ONCE(curr != current)) + return -EINVAL; + if (WARN_ON_ONCE(curr->flags & PF_USED_MATH)) + return -EINVAL; + + /* + * Memory allocation at the first usage of the FPU and other state. + */ + ret = fpstate_alloc(&curr->thread.fpu); + if (ret) + return ret; + + fpstate_init(&curr->thread.fpu); + + /* Safe to do for the current task: */ + curr->flags |= PF_USED_MATH; + + return 0; +} +EXPORT_SYMBOL_GPL(fpstate_alloc_init); + +/* + * The _current_ task is using the FPU for the first time + * so initialize it and set the mxcsr to its default + * value at reset if we support XMM instructions and then + * remember the current task has used the FPU. + */ +static int fpu__unlazy_stopped(struct task_struct *child) +{ + int ret; + + if (WARN_ON_ONCE(child == current)) + return -EINVAL; + + if (child->flags & PF_USED_MATH) { + task_disable_lazy_fpu_restore(child); + return 0; + } + + /* + * Memory allocation at the first usage of the FPU and other state. + */ + ret = fpstate_alloc(&child->thread.fpu); + if (ret) + return ret; + + fpstate_init(&child->thread.fpu); + + /* Safe to do for stopped child tasks: */ + child->flags |= PF_USED_MATH; + + return 0; +} + +/* + * The xstateregs_active() routine is the same as the fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilites supported by the xsave. + */ +int fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return tsk_used_math(target) ? regset->n : 0; +} + +int xfpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0; +} + +int xfpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_fxsr) + return -ENODEV; + + ret = fpu__unlazy_stopped(target); + if (ret) + return ret; + + sanitize_i387_state(target); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.state->fxsave, 0, -1); +} + +int xfpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!cpu_has_fxsr) + return -ENODEV; + + ret = fpu__unlazy_stopped(target); + if (ret) + return ret; + + sanitize_i387_state(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.state->fxsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; + + /* + * update the header bits in the xsave header, indicating the + * presence of FP and SSE state. + */ + if (cpu_has_xsave) + target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + + return ret; +} + +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct xsave_struct *xsave; + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = fpu__unlazy_stopped(target); + if (ret) + return ret; + + xsave = &target->thread.fpu.state->xsave; + + /* + * Copy the 48bytes defined by the software first into the xstate + * memory layout in the thread struct, so that we can copy the entire + * xstateregs to the user using one user_regset_copyout(). + */ + memcpy(&xsave->i387.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + /* + * Copy the xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + return ret; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct xsave_struct *xsave; + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = fpu__unlazy_stopped(target); + if (ret) + return ret; + + xsave = &target->thread.fpu.state->xsave; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + xsave->xsave_hdr.xstate_bv &= pcntxt_mask; + /* + * These bits must be zero. + */ + memset(&xsave->xsave_hdr.reserved, 0, 48); + return ret; +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + + return tmp; +} + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 + +static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) +{ + struct _fpxreg *st; + u32 tos = (fxsave->swd >> 11) & 7; + u32 twd = (unsigned long) fxsave->twd; + u32 tag; + u32 ret = 0xffff0000u; + int i; + + for (i = 0; i < 8; i++, twd >>= 1) { + if (twd & 0x1) { + st = FPREG_ADDR(fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = FP_EXP_TAG_SPECIAL; + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) + tag = FP_EXP_TAG_ZERO; + else + tag = FP_EXP_TAG_SPECIAL; + break; + default: + if (st->significand[3] & 0x8000) + tag = FP_EXP_TAG_VALID; + else + tag = FP_EXP_TAG_SPECIAL; + break; + } + } else { + tag = FP_EXP_TAG_EMPTY; + } + ret |= tag << (2 * i); + } + return ret; +} + +/* + * FXSR floating point environment conversions. + */ + +void +convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; + struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + env->cwd = fxsave->cwd | 0xffff0000u; + env->swd = fxsave->swd | 0xffff0000u; + env->twd = twd_fxsr_to_i387(fxsave); + +#ifdef CONFIG_X86_64 + env->fip = fxsave->rip; + env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + env->fcs = task_pt_regs(tsk)->cs; + if (tsk == current) { + savesegment(ds, env->fos); + } else { + env->fos = tsk->thread.ds; + } + env->fos |= 0xffff0000; +#else + env->fip = fxsave->fip; + env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); + env->foo = fxsave->foo; + env->fos = fxsave->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(to[0])); +} + +void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env) + +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; + struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + fxsave->cwd = env->cwd; + fxsave->swd = env->swd; + fxsave->twd = twd_i387_to_fxsr(env->twd); + fxsave->fop = (u16) ((u32) env->fcs >> 16); +#ifdef CONFIG_X86_64 + fxsave->rip = env->fip; + fxsave->rdp = env->foo; + /* cs and ds ignored */ +#else + fxsave->fip = env->fip; + fxsave->fcs = (env->fcs & 0xffff); + fxsave->foo = env->foo; + fxsave->fos = env->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(from[0])); +} + +int fpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct user_i387_ia32_struct env; + int ret; + + ret = fpu__unlazy_stopped(target); + if (ret) + return ret; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + + if (!cpu_has_fxsr) + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.state->fsave, 0, + -1); + + sanitize_i387_state(target); + + if (kbuf && pos == 0 && count == sizeof(env)) { + convert_from_fxsr(kbuf, target); + return 0; + } + + convert_from_fxsr(&env, target); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); +} + +int fpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_i387_ia32_struct env; + int ret; + + ret = fpu__unlazy_stopped(target); + if (ret) + return ret; + + sanitize_i387_state(target); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + + if (!cpu_has_fxsr) + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.state->fsave, 0, + -1); + + if (pos > 0 || count < sizeof(env)) + convert_from_fxsr(&env, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (!ret) + convert_to_fxsr(target, &env); + + /* + * update the header bit in the xsave header, indicating the + * presence of FP. + */ + if (cpu_has_xsave) + target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; + return ret; +} + +/* + * FPU state for core dumps. + * This is only used for a.out dumps now. + * It is declared generically using elf_fpregset_t (which is + * struct user_i387_struct) but is in fact only used for 32-bit + * dumps, so on 64-bit it is really struct user_i387_ia32_struct. + */ +int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) +{ + struct task_struct *tsk = current; + int fpvalid; + + fpvalid = !!used_math(); + if (fpvalid) + fpvalid = !fpregs_get(tsk, NULL, + 0, sizeof(struct user_i387_ia32_struct), + fpu, NULL); + + return fpvalid; +} +EXPORT_SYMBOL(dump_fpu); + +#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ + +static int __init no_387(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FPU); + return 1; +} + +__setup("no387", no_387); + +/* + * Set the X86_FEATURE_FPU CPU-capability bit based on + * trying to execute an actual sequence of FPU instructions: + */ +void fpu__detect(struct cpuinfo_x86 *c) +{ + unsigned long cr0; + u16 fsw, fcw; + + fsw = fcw = 0xffff; + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS | X86_CR0_EM); + write_cr0(cr0); + + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); + + if (fsw == 0 && (fcw & 0x103f) == 0x003f) + set_cpu_cap(c, X86_FEATURE_FPU); + else + clear_cpu_cap(c, X86_FEATURE_FPU); + + /* The final cr0 value is set in fpu_init() */ +} diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c new file mode 100644 index 000000000000..163b5cc582ef --- /dev/null +++ b/arch/x86/kernel/fpu/xsave.c @@ -0,0 +1,724 @@ +/* + * xsave/xrstor support. + * + * Author: Suresh Siddha + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Supported feature mask by the CPU and the kernel. + */ +u64 pcntxt_mask; + +/* + * Represents init state for the supported extended state. + */ +struct xsave_struct *init_xstate_buf; + +static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; +static unsigned int *xstate_offsets, *xstate_sizes; +static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8]; +static unsigned int xstate_features; + +/* + * If a processor implementation discern that a processor state component is + * in its initialized state it may modify the corresponding bit in the + * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory + * layout in the case of xsaveopt. While presenting the xstate information to + * the user, we always ensure that the memory layout of a feature will be in + * the init state if the corresponding header bit is zero. This is to ensure + * that the user doesn't see some stale state in the memory layout during + * signal handling, debugging etc. + */ +void __sanitize_i387_state(struct task_struct *tsk) +{ + struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; + int feature_bit = 0x2; + u64 xstate_bv; + + if (!fx) + return; + + xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is up to date. + */ + if ((xstate_bv & pcntxt_mask) == pcntxt_mask) + return; + + /* + * FP is in init state + */ + if (!(xstate_bv & XSTATE_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xstate_bv & XSTATE_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; + + /* + * Update all the other memory layouts for which the corresponding + * header bit is in the init state. + */ + while (xstate_bv) { + if (xstate_bv & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy(((void *) fx) + offset, + ((void *) init_xstate_buf) + offset, + size); + } + + xstate_bv >>= 1; + feature_bit++; + } +} + +/* + * Check for the presence of extended state information in the + * user fpstate pointer in the sigcontext. + */ +static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw) +{ + int min_xstate_size = sizeof(struct i387_fxsave_struct) + + sizeof(struct xsave_hdr_struct); + unsigned int magic2; + + if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) + return -1; + + /* Check for the first magic field and other error scenarios. */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || + fx_sw->xstate_size < min_xstate_size || + fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fx_sw->extended_size) + return -1; + + /* + * Check for the presence of second magic word at the end of memory + * layout. This detects the case where the user just copied the legacy + * fpstate layout with out copying the extended state information + * in the memory layout. + */ + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) + || magic2 != FP_XSTATE_MAGIC2) + return -1; + + return 0; +} + +/* + * Signal frame handlers. + */ +static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +{ + if (use_fxsr()) { + struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct user_i387_ia32_struct env; + struct _fpstate_ia32 __user *fp = buf; + + convert_from_fxsr(&env, tsk); + + if (__copy_to_user(buf, &env, sizeof(env)) || + __put_user(xsave->i387.swd, &fp->status) || + __put_user(X86_FXSR_MAGIC, &fp->magic)) + return -1; + } else { + struct i387_fsave_struct __user *fp = buf; + u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) + return -1; + } + + return 0; +} + +static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +{ + struct xsave_struct __user *x = buf; + struct _fpx_sw_bytes *sw_bytes; + u32 xstate_bv; + int err; + + /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ + sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; + err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + + if (!use_xsave()) + return err; + + err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); + + /* + * Read the xstate_bv which we copied (directly from the cpu or + * from the state in task struct) to the user buffers. + */ + err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave aware apps can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xstate_bv |= XSTATE_FPSSE; + + err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); + + return err; +} + +static inline int save_user_xstate(struct xsave_struct __user *buf) +{ + int err; + + if (use_xsave()) + err = xsave_user(buf); + else if (use_fxsr()) + err = fxsave_user((struct i387_fxsave_struct __user *) buf); + else + err = fsave_user((struct i387_fsave_struct __user *) buf); + + if (unlikely(err) && __clear_user(buf, xstate_size)) + err = -EFAULT; + return err; +} + +/* + * Save the fpu, extended register state to the user signal frame. + * + * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save + * state is copied. + * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. + * + * buf == buf_fx for 64-bit frames and 32-bit fsave frame. + * buf != buf_fx for 32-bit frames with fxstate. + * + * If the fpu, extended register state is live, save the state directly + * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, + * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * + * If this is a 32-bit frame with fxstate, put a fsave header before + * the aligned state at 'buf_fx'. + * + * For [f]xsave state, update the SW reserved fields in the [f]xsave frame + * indicating the absence/presence of the extended state to the user. + */ +int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) +{ + struct xsave_struct *xsave = ¤t->thread.fpu.state->xsave; + struct task_struct *tsk = current; + int ia32_fxstate = (buf != buf_fx); + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!access_ok(VERIFY_WRITE, buf, size)) + return -EACCES; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), NULL, + (struct _fpstate_ia32 __user *) buf) ? -1 : 1; + + if (user_has_fpu()) { + /* Save the live register state to the user directly. */ + if (save_user_xstate(buf_fx)) + return -1; + /* Update the thread's fxstate to save the fsave header. */ + if (ia32_fxstate) + fpu_fxsave(&tsk->thread.fpu); + } else { + sanitize_i387_state(tsk); + if (__copy_to_user(buf_fx, xsave, xstate_size)) + return -1; + } + + /* Save the fsave header for the 32-bit frames. */ + if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) + return -1; + + if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) + return -1; + + return 0; +} + +static inline void +sanitize_restored_xstate(struct task_struct *tsk, + struct user_i387_ia32_struct *ia32_env, + u64 xstate_bv, int fx_only) +{ + struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr; + + if (use_xsave()) { + /* These bits must be zero. */ + memset(xsave_hdr->reserved, 0, 48); + + /* + * Init the state that is not present in the memory + * layout and not enabled by the OS. + */ + if (fx_only) + xsave_hdr->xstate_bv = XSTATE_FPSSE; + else + xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv); + } + + if (use_fxsr()) { + /* + * mscsr reserved bits must be masked to zero for security + * reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + + convert_to_fxsr(tsk, ia32_env); + } +} + +/* + * Restore the extended state if present. Otherwise, restore the FP/SSE state. + */ +static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) +{ + if (use_xsave()) { + if ((unsigned long)buf % 64 || fx_only) { + u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE; + xrstor_state(init_xstate_buf, init_bv); + return fxrstor_user(buf); + } else { + u64 init_bv = pcntxt_mask & ~xbv; + if (unlikely(init_bv)) + xrstor_state(init_xstate_buf, init_bv); + return xrestore_user(buf, xbv); + } + } else if (use_fxsr()) { + return fxrstor_user(buf); + } else + return frstor_user(buf); +} + +int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) +{ + int ia32_fxstate = (buf != buf_fx); + struct task_struct *tsk = current; + int state_size = xstate_size; + u64 xstate_bv = 0; + int fx_only = 0; + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!buf) { + fpu_reset_state(tsk); + return 0; + } + + if (!access_ok(VERIFY_READ, buf, size)) + return -EACCES; + + if (!used_math() && fpstate_alloc_init(tsk)) + return -1; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; + + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; + if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { + /* + * Couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. + */ + state_size = sizeof(struct i387_fxsave_struct); + fx_only = 1; + } else { + state_size = fx_sw_user.xstate_size; + xstate_bv = fx_sw_user.xstate_bv; + } + } + + if (ia32_fxstate) { + /* + * For 32-bit frames with fxstate, copy the user state to the + * thread's fpu state, reconstruct fxstate from the fsave + * header. Sanitize the copied state etc. + */ + struct fpu *fpu = &tsk->thread.fpu; + struct user_i387_ia32_struct env; + int err = 0; + + /* + * Drop the current fpu which clears used_math(). This ensures + * that any context-switch during the copy of the new state, + * avoids the intermediate state from getting restored/saved. + * Thus avoiding the new restored state from getting corrupted. + * We will be ready to restore/save the state only after + * set_used_math() is again set. + */ + drop_fpu(tsk); + + if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) || + __copy_from_user(&env, buf, sizeof(env))) { + fpstate_init(fpu); + err = -1; + } else { + sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); + } + + set_used_math(); + if (use_eager_fpu()) { + preempt_disable(); + math_state_restore(); + preempt_enable(); + } + + return err; + } else { + /* + * For 64-bit frames and 32-bit fsave frames, restore the user + * state to the registers directly (with exceptions handled). + */ + user_fpu_begin(); + if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { + fpu_reset_state(tsk); + return -1; + } + } + + return 0; +} + +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed by the fpstate pointer in the sigcontext. + * This will be saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +static void prepare_fx_sw_frame(void) +{ + int fsave_header_size = sizeof(struct i387_fsave_struct); + int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + + if (config_enabled(CONFIG_X86_32)) + size += fsave_header_size; + + fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; + fx_sw_reserved.extended_size = size; + fx_sw_reserved.xstate_bv = pcntxt_mask; + fx_sw_reserved.xstate_size = xstate_size; + + if (config_enabled(CONFIG_IA32_EMULATION)) { + fx_sw_reserved_ia32 = fx_sw_reserved; + fx_sw_reserved_ia32.extended_size += fsave_header_size; + } +} + +/* + * Enable the extended processor state save/restore feature + */ +static inline void xstate_enable(void) +{ + cr4_set_bits(X86_CR4_OSXSAVE); + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); +} + +/* + * Record the offsets and sizes of different state managed by the xsave + * memory layout. + */ +static void __init setup_xstate_features(void) +{ + int eax, ebx, ecx, edx, leaf = 0x2; + + xstate_features = fls64(pcntxt_mask); + xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); + xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); + + do { + cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0) + break; + + xstate_offsets[leaf] = ebx; + xstate_sizes[leaf] = eax; + + leaf++; + } while (1); +} + +/* + * This function sets up offsets and sizes of all extended states in + * xsave area. This supports both standard format and compacted format + * of the xsave aread. + * + * Input: void + * Output: void + */ +void setup_xstate_comp(void) +{ + unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8]; + int i; + + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_comp_offsets[0] = 0; + xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space); + + if (!cpu_has_xsaves) { + for (i = 2; i < xstate_features; i++) { + if (test_bit(i, (unsigned long *)&pcntxt_mask)) { + xstate_comp_offsets[i] = xstate_offsets[i]; + xstate_comp_sizes[i] = xstate_sizes[i]; + } + } + return; + } + + xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = 2; i < xstate_features; i++) { + if (test_bit(i, (unsigned long *)&pcntxt_mask)) + xstate_comp_sizes[i] = xstate_sizes[i]; + else + xstate_comp_sizes[i] = 0; + + if (i > 2) + xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + + xstate_comp_sizes[i-1]; + + } +} + +/* + * setup the xstate image representing the init state + */ +static void __init setup_init_fpu_buf(void) +{ + /* + * Setup init_xstate_buf to represent the init state of + * all the features managed by the xsave + */ + init_xstate_buf = alloc_bootmem_align(xstate_size, + __alignof__(struct xsave_struct)); + fx_finit(&init_xstate_buf->i387); + + if (!cpu_has_xsave) + return; + + setup_xstate_features(); + + if (cpu_has_xsaves) { + init_xstate_buf->xsave_hdr.xcomp_bv = + (u64)1 << 63 | pcntxt_mask; + init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask; + } + + /* + * Init all the features state with header_bv being 0x0 + */ + xrstor_state_booting(init_xstate_buf, -1); + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + xsave_state_booting(init_xstate_buf, -1); +} + +static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; +static int __init eager_fpu_setup(char *s) +{ + if (!strcmp(s, "on")) + eagerfpu = ENABLE; + else if (!strcmp(s, "off")) + eagerfpu = DISABLE; + else if (!strcmp(s, "auto")) + eagerfpu = AUTO; + return 1; +} +__setup("eagerfpu=", eager_fpu_setup); + + +/* + * Calculate total size of enabled xstates in XCR0/pcntxt_mask. + */ +static void __init init_xstate_size(void) +{ + unsigned int eax, ebx, ecx, edx; + int i; + + if (!cpu_has_xsaves) { + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + xstate_size = ebx; + return; + } + + xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for (i = 2; i < 64; i++) { + if (test_bit(i, (unsigned long *)&pcntxt_mask)) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_size += eax; + } + } +} + +/* + * Enable and initialize the xsave feature. + */ +static void __init xstate_enable_boot_cpu(void) +{ + unsigned int eax, ebx, ecx, edx; + + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); + return; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + pcntxt_mask = eax + ((u64)edx << 32); + + if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { + pr_err("FP/SSE not shown under xsave features 0x%llx\n", + pcntxt_mask); + BUG(); + } + + /* + * Support only the state known to OS. + */ + pcntxt_mask = pcntxt_mask & XCNTXT_MASK; + + xstate_enable(); + + /* + * Recompute the context size for enabled features + */ + init_xstate_size(); + + update_regset_xstate_info(xstate_size, pcntxt_mask); + prepare_fx_sw_frame(); + setup_init_fpu_buf(); + + /* Auto enable eagerfpu for xsaveopt */ + if (cpu_has_xsaveopt && eagerfpu != DISABLE) + eagerfpu = ENABLE; + + if (pcntxt_mask & XSTATE_EAGER) { + if (eagerfpu == DISABLE) { + pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n", + pcntxt_mask & XSTATE_EAGER); + pcntxt_mask &= ~XSTATE_EAGER; + } else { + eagerfpu = ENABLE; + } + } + + pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n", + pcntxt_mask, xstate_size, + cpu_has_xsaves ? "compacted form" : "standard form"); +} + +/* + * For the very first instance, this calls xstate_enable_boot_cpu(); + * for all subsequent instances, this calls xstate_enable(). + * + * This is somewhat obfuscated due to the lack of powerful enough + * overrides for the section checks. + */ +void xsave_init(void) +{ + static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; + void (*this_func)(void); + + if (!cpu_has_xsave) + return; + + this_func = next_func; + next_func = xstate_enable; + this_func(); +} + +/* + * setup_init_fpu_buf() is __init and it is OK to call it here because + * init_xstate_buf will be unset only once during boot. + */ +void __init_refok eager_fpu_init(void) +{ + WARN_ON(used_math()); + current_thread_info()->status = 0; + + if (eagerfpu == ENABLE) + setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + + if (!cpu_has_eager_fpu) { + stts(); + return; + } + + if (!init_xstate_buf) + setup_init_fpu_buf(); +} + +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Inputs: + * xsave: base address of the xsave area; + * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE, + * etc.) + * Output: + * address of the state in the xsave area. + */ +void *get_xsave_addr(struct xsave_struct *xsave, int xstate) +{ + int feature = fls64(xstate) - 1; + if (!test_bit(feature, (unsigned long *)&pcntxt_mask)) + return NULL; + + return (void *)xsave + xstate_comp_offsets[feature]; +} +EXPORT_SYMBOL_GPL(get_xsave_addr); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c deleted file mode 100644 index 01101553c6c1..000000000000 --- a/arch/x86/kernel/i387.c +++ /dev/null @@ -1,718 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes , May 2000 - */ -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_PER_CPU(bool, in_kernel_fpu); - -void kernel_fpu_disable(void) -{ - WARN_ON(this_cpu_read(in_kernel_fpu)); - this_cpu_write(in_kernel_fpu, true); -} - -void kernel_fpu_enable(void) -{ - this_cpu_write(in_kernel_fpu, false); -} - -/* - * Were we in an interrupt that interrupted kernel mode? - * - * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: the thread must not have fpu (so - * that we don't try to save the FPU state), and TS must - * be set (so that the clts/stts pair does nothing that is - * visible in the interrupted kernel thread). - * - * Except for the eagerfpu case when we return true; in the likely case - * the thread has FPU but we are not going to set/clear TS. - */ -static inline bool interrupted_kernel_fpu_idle(void) -{ - if (this_cpu_read(in_kernel_fpu)) - return false; - - if (use_eager_fpu()) - return true; - - return !__thread_has_fpu(current) && - (read_cr0() & X86_CR0_TS); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static inline bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); -} - -/* - * Can we use the FPU in kernel mode with the - * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. - */ -bool irq_fpu_usable(void) -{ - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); -} -EXPORT_SYMBOL(irq_fpu_usable); - -void __kernel_fpu_begin(void) -{ - struct task_struct *me = current; - - this_cpu_write(in_kernel_fpu, true); - - if (__thread_has_fpu(me)) { - __save_init_fpu(me); - } else { - this_cpu_write(fpu_owner_task, NULL); - if (!use_eager_fpu()) - clts(); - } -} -EXPORT_SYMBOL(__kernel_fpu_begin); - -void __kernel_fpu_end(void) -{ - struct task_struct *me = current; - - if (__thread_has_fpu(me)) { - if (WARN_ON(restore_fpu_checking(me))) - fpu_reset_state(me); - } else if (!use_eager_fpu()) { - stts(); - } - - this_cpu_write(in_kernel_fpu, false); -} -EXPORT_SYMBOL(__kernel_fpu_end); - -/* - * Save the FPU state (initialize it if necessary): - * - * This only ever gets called for the current task. - */ -void fpu__save(struct task_struct *tsk) -{ - WARN_ON(tsk != current); - - preempt_disable(); - if (__thread_has_fpu(tsk)) { - if (use_eager_fpu()) { - __save_fpu(tsk); - } else { - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - } - } - preempt_enable(); -} -EXPORT_SYMBOL_GPL(fpu__save); - -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; -unsigned int xstate_size; -EXPORT_SYMBOL_GPL(xstate_size); -static struct i387_fxsave_struct fx_scratch; - -static void mxcsr_feature_mask_init(void) -{ - unsigned long mask = 0; - - if (cpu_has_fxsr) { - memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : "+m" (fx_scratch)); - mask = fx_scratch.mxcsr_mask; - if (mask == 0) - mask = 0x0000ffbf; - } - mxcsr_feature_mask &= mask; -} - -static void fpstate_xstate_init_size(void) -{ - /* - * Note that xstate_size might be overwriten later during - * xsave_init(). - */ - - if (!cpu_has_fpu) { - /* - * Disable xsave as we do not support it if i387 - * emulation is enabled. - */ - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - xstate_size = sizeof(struct i387_soft_struct); - return; - } - - if (cpu_has_fxsr) - xstate_size = sizeof(struct i387_fxsave_struct); - else - xstate_size = sizeof(struct i387_fsave_struct); -} - -/* - * Called on the boot CPU at bootup to set up the initial FPU state that - * is later cloned into all processes. - * - * Also called on secondary CPUs to set up the FPU state of their - * idle threads. - */ -void fpu__cpu_init(void) -{ - unsigned long cr0; - unsigned long cr4_mask = 0; - -#ifndef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) { - pr_emerg("No FPU found and no math emulation present\n"); - pr_emerg("Giving up\n"); - for (;;) - asm volatile("hlt"); - } -#endif - if (cpu_has_fxsr) - cr4_mask |= X86_CR4_OSFXSR; - if (cpu_has_xmm) - cr4_mask |= X86_CR4_OSXMMEXCPT; - if (cr4_mask) - cr4_set_bits(cr4_mask); - - cr0 = read_cr0(); - cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ - if (!cpu_has_fpu) - cr0 |= X86_CR0_EM; - write_cr0(cr0); - - /* - * fpstate_xstate_init_size() is only called once, to avoid overriding - * 'xstate_size' during (secondary CPU) bootup or during CPU hotplug. - */ - if (xstate_size == 0) - fpstate_xstate_init_size(); - - mxcsr_feature_mask_init(); - xsave_init(); - eager_fpu_init(); -} - -void fpstate_init(struct fpu *fpu) -{ - if (!cpu_has_fpu) { - finit_soft_fpu(&fpu->state->soft); - return; - } - - memset(fpu->state, 0, xstate_size); - - if (cpu_has_fxsr) { - fx_finit(&fpu->state->fxsave); - } else { - struct i387_fsave_struct *fp = &fpu->state->fsave; - fp->cwd = 0xffff037fu; - fp->swd = 0xffff0000u; - fp->twd = 0xffffffffu; - fp->fos = 0xffff0000u; - } -} -EXPORT_SYMBOL_GPL(fpstate_init); - -int fpstate_alloc(struct fpu *fpu) -{ - if (fpu->state) - return 0; - - fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); - if (!fpu->state) - return -ENOMEM; - - /* The CPU requires the FPU state to be aligned to 16 byte boundaries: */ - WARN_ON((unsigned long)fpu->state & 15); - - return 0; -} -EXPORT_SYMBOL_GPL(fpstate_alloc); - -/* - * Allocate the backing store for the current task's FPU registers - * and initialize the registers themselves as well. - * - * Can fail. - */ -int fpstate_alloc_init(struct task_struct *curr) -{ - int ret; - - if (WARN_ON_ONCE(curr != current)) - return -EINVAL; - if (WARN_ON_ONCE(curr->flags & PF_USED_MATH)) - return -EINVAL; - - /* - * Memory allocation at the first usage of the FPU and other state. - */ - ret = fpstate_alloc(&curr->thread.fpu); - if (ret) - return ret; - - fpstate_init(&curr->thread.fpu); - - /* Safe to do for the current task: */ - curr->flags |= PF_USED_MATH; - - return 0; -} -EXPORT_SYMBOL_GPL(fpstate_alloc_init); - -/* - * The _current_ task is using the FPU for the first time - * so initialize it and set the mxcsr to its default - * value at reset if we support XMM instructions and then - * remember the current task has used the FPU. - */ -static int fpu__unlazy_stopped(struct task_struct *child) -{ - int ret; - - if (WARN_ON_ONCE(child == current)) - return -EINVAL; - - if (child->flags & PF_USED_MATH) { - task_disable_lazy_fpu_restore(child); - return 0; - } - - /* - * Memory allocation at the first usage of the FPU and other state. - */ - ret = fpstate_alloc(&child->thread.fpu); - if (ret) - return ret; - - fpstate_init(&child->thread.fpu); - - /* Safe to do for stopped child tasks: */ - child->flags |= PF_USED_MATH; - - return 0; -} - -/* - * The xstateregs_active() routine is the same as the fpregs_active() routine, - * as the "regset->n" for the xstate regset will be updated based on the feature - * capabilites supported by the xsave. - */ -int fpregs_active(struct task_struct *target, const struct user_regset *regset) -{ - return tsk_used_math(target) ? regset->n : 0; -} - -int xfpregs_active(struct task_struct *target, const struct user_regset *regset) -{ - return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0; -} - -int xfpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - if (!cpu_has_fxsr) - return -ENODEV; - - ret = fpu__unlazy_stopped(target); - if (ret) - return ret; - - sanitize_i387_state(target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fxsave, 0, -1); -} - -int xfpregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - - if (!cpu_has_fxsr) - return -ENODEV; - - ret = fpu__unlazy_stopped(target); - if (ret) - return ret; - - sanitize_i387_state(target); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fxsave, 0, -1); - - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; - - /* - * update the header bits in the xsave header, indicating the - * presence of FP and SSE state. - */ - if (cpu_has_xsave) - target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; - - return ret; -} - -int xstateregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - struct xsave_struct *xsave; - int ret; - - if (!cpu_has_xsave) - return -ENODEV; - - ret = fpu__unlazy_stopped(target); - if (ret) - return ret; - - xsave = &target->thread.fpu.state->xsave; - - /* - * Copy the 48bytes defined by the software first into the xstate - * memory layout in the thread struct, so that we can copy the entire - * xstateregs to the user using one user_regset_copyout(). - */ - memcpy(&xsave->i387.sw_reserved, - xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - /* - * Copy the xstate memory layout. - */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - return ret; -} - -int xstateregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - struct xsave_struct *xsave; - int ret; - - if (!cpu_has_xsave) - return -ENODEV; - - ret = fpu__unlazy_stopped(target); - if (ret) - return ret; - - xsave = &target->thread.fpu.state->xsave; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->xsave_hdr.xstate_bv &= pcntxt_mask; - /* - * These bits must be zero. - */ - memset(&xsave->xsave_hdr.reserved, 0, 48); - return ret; -} - -#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION - -/* - * FPU tag word conversions. - */ - -static inline unsigned short twd_i387_to_fxsr(unsigned short twd) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. */ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - - return tmp; -} - -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) -#define FP_EXP_TAG_VALID 0 -#define FP_EXP_TAG_ZERO 1 -#define FP_EXP_TAG_SPECIAL 2 -#define FP_EXP_TAG_EMPTY 3 - -static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) -{ - struct _fpxreg *st; - u32 tos = (fxsave->swd >> 11) & 7; - u32 twd = (unsigned long) fxsave->twd; - u32 tag; - u32 ret = 0xffff0000u; - int i; - - for (i = 0; i < 8; i++, twd >>= 1) { - if (twd & 0x1) { - st = FPREG_ADDR(fxsave, (i - tos) & 7); - - switch (st->exponent & 0x7fff) { - case 0x7fff: - tag = FP_EXP_TAG_SPECIAL; - break; - case 0x0000: - if (!st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3]) - tag = FP_EXP_TAG_ZERO; - else - tag = FP_EXP_TAG_SPECIAL; - break; - default: - if (st->significand[3] & 0x8000) - tag = FP_EXP_TAG_VALID; - else - tag = FP_EXP_TAG_SPECIAL; - break; - } - } else { - tag = FP_EXP_TAG_EMPTY; - } - ret |= tag << (2 * i); - } - return ret; -} - -/* - * FXSR floating point environment conversions. - */ - -void -convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) -{ - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; - struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; - struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; - int i; - - env->cwd = fxsave->cwd | 0xffff0000u; - env->swd = fxsave->swd | 0xffff0000u; - env->twd = twd_fxsr_to_i387(fxsave); - -#ifdef CONFIG_X86_64 - env->fip = fxsave->rip; - env->foo = fxsave->rdp; - /* - * should be actually ds/cs at fpu exception time, but - * that information is not available in 64bit mode. - */ - env->fcs = task_pt_regs(tsk)->cs; - if (tsk == current) { - savesegment(ds, env->fos); - } else { - env->fos = tsk->thread.ds; - } - env->fos |= 0xffff0000; -#else - env->fip = fxsave->fip; - env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); - env->foo = fxsave->foo; - env->fos = fxsave->fos; -#endif - - for (i = 0; i < 8; ++i) - memcpy(&to[i], &from[i], sizeof(to[0])); -} - -void convert_to_fxsr(struct task_struct *tsk, - const struct user_i387_ia32_struct *env) - -{ - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; - struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; - struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; - int i; - - fxsave->cwd = env->cwd; - fxsave->swd = env->swd; - fxsave->twd = twd_i387_to_fxsr(env->twd); - fxsave->fop = (u16) ((u32) env->fcs >> 16); -#ifdef CONFIG_X86_64 - fxsave->rip = env->fip; - fxsave->rdp = env->foo; - /* cs and ds ignored */ -#else - fxsave->fip = env->fip; - fxsave->fcs = (env->fcs & 0xffff); - fxsave->foo = env->foo; - fxsave->fos = env->fos; -#endif - - for (i = 0; i < 8; ++i) - memcpy(&to[i], &from[i], sizeof(from[0])); -} - -int fpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - struct user_i387_ia32_struct env; - int ret; - - ret = fpu__unlazy_stopped(target); - if (ret) - return ret; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - - if (!cpu_has_fxsr) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fsave, 0, - -1); - - sanitize_i387_state(target); - - if (kbuf && pos == 0 && count == sizeof(env)) { - convert_from_fxsr(kbuf, target); - return 0; - } - - convert_from_fxsr(&env, target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); -} - -int fpregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - struct user_i387_ia32_struct env; - int ret; - - ret = fpu__unlazy_stopped(target); - if (ret) - return ret; - - sanitize_i387_state(target); - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - - if (!cpu_has_fxsr) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fsave, 0, - -1); - - if (pos > 0 || count < sizeof(env)) - convert_from_fxsr(&env, target); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); - if (!ret) - convert_to_fxsr(target, &env); - - /* - * update the header bit in the xsave header, indicating the - * presence of FP. - */ - if (cpu_has_xsave) - target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; - return ret; -} - -/* - * FPU state for core dumps. - * This is only used for a.out dumps now. - * It is declared generically using elf_fpregset_t (which is - * struct user_i387_struct) but is in fact only used for 32-bit - * dumps, so on 64-bit it is really struct user_i387_ia32_struct. - */ -int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) -{ - struct task_struct *tsk = current; - int fpvalid; - - fpvalid = !!used_math(); - if (fpvalid) - fpvalid = !fpregs_get(tsk, NULL, - 0, sizeof(struct user_i387_ia32_struct), - fpu, NULL); - - return fpvalid; -} -EXPORT_SYMBOL(dump_fpu); - -#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ - -static int __init no_387(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FPU); - return 1; -} - -__setup("no387", no_387); - -/* - * Set the X86_FEATURE_FPU CPU-capability bit based on - * trying to execute an actual sequence of FPU instructions: - */ -void fpu__detect(struct cpuinfo_x86 *c) -{ - unsigned long cr0; - u16 fsw, fcw; - - fsw = fcw = 0xffff; - - cr0 = read_cr0(); - cr0 &= ~(X86_CR0_TS | X86_CR0_EM); - write_cr0(cr0); - - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); - - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); - else - clear_cpu_cap(c, X86_FEATURE_FPU); - - /* The final cr0 value is set in fpu_init() */ -} diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c deleted file mode 100644 index 163b5cc582ef..000000000000 --- a/arch/x86/kernel/xsave.c +++ /dev/null @@ -1,724 +0,0 @@ -/* - * xsave/xrstor support. - * - * Author: Suresh Siddha - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Supported feature mask by the CPU and the kernel. - */ -u64 pcntxt_mask; - -/* - * Represents init state for the supported extended state. - */ -struct xsave_struct *init_xstate_buf; - -static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; -static unsigned int *xstate_offsets, *xstate_sizes; -static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8]; -static unsigned int xstate_features; - -/* - * If a processor implementation discern that a processor state component is - * in its initialized state it may modify the corresponding bit in the - * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory - * layout in the case of xsaveopt. While presenting the xstate information to - * the user, we always ensure that the memory layout of a feature will be in - * the init state if the corresponding header bit is zero. This is to ensure - * that the user doesn't see some stale state in the memory layout during - * signal handling, debugging etc. - */ -void __sanitize_i387_state(struct task_struct *tsk) -{ - struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; - int feature_bit = 0x2; - u64 xstate_bv; - - if (!fx) - return; - - xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; - - /* - * None of the feature bits are in init state. So nothing else - * to do for us, as the memory layout is up to date. - */ - if ((xstate_bv & pcntxt_mask) == pcntxt_mask) - return; - - /* - * FP is in init state - */ - if (!(xstate_bv & XSTATE_FP)) { - fx->cwd = 0x37f; - fx->swd = 0; - fx->twd = 0; - fx->fop = 0; - fx->rip = 0; - fx->rdp = 0; - memset(&fx->st_space[0], 0, 128); - } - - /* - * SSE is in init state - */ - if (!(xstate_bv & XSTATE_SSE)) - memset(&fx->xmm_space[0], 0, 256); - - xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; - - /* - * Update all the other memory layouts for which the corresponding - * header bit is in the init state. - */ - while (xstate_bv) { - if (xstate_bv & 0x1) { - int offset = xstate_offsets[feature_bit]; - int size = xstate_sizes[feature_bit]; - - memcpy(((void *) fx) + offset, - ((void *) init_xstate_buf) + offset, - size); - } - - xstate_bv >>= 1; - feature_bit++; - } -} - -/* - * Check for the presence of extended state information in the - * user fpstate pointer in the sigcontext. - */ -static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *fx_sw) -{ - int min_xstate_size = sizeof(struct i387_fxsave_struct) + - sizeof(struct xsave_hdr_struct); - unsigned int magic2; - - if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) - return -1; - - /* Check for the first magic field and other error scenarios. */ - if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || - fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > xstate_size || - fx_sw->xstate_size > fx_sw->extended_size) - return -1; - - /* - * Check for the presence of second magic word at the end of memory - * layout. This detects the case where the user just copied the legacy - * fpstate layout with out copying the extended state information - * in the memory layout. - */ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) - || magic2 != FP_XSTATE_MAGIC2) - return -1; - - return 0; -} - -/* - * Signal frame handlers. - */ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) -{ - if (use_fxsr()) { - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; - struct user_i387_ia32_struct env; - struct _fpstate_ia32 __user *fp = buf; - - convert_from_fxsr(&env, tsk); - - if (__copy_to_user(buf, &env, sizeof(env)) || - __put_user(xsave->i387.swd, &fp->status) || - __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; - } else { - struct i387_fsave_struct __user *fp = buf; - u32 swd; - if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; - } - - return 0; -} - -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) -{ - struct xsave_struct __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; - u32 xstate_bv; - int err; - - /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); - - if (!use_xsave()) - return err; - - err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); - - /* - * Read the xstate_bv which we copied (directly from the cpu or - * from the state in task struct) to the user buffers. - */ - err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); - - /* - * For legacy compatible, we always set FP/SSE bits in the bit - * vector while saving the state to the user context. This will - * enable us capturing any changes(during sigreturn) to - * the FP/SSE bits by the legacy applications which don't touch - * xstate_bv in the xsave header. - * - * xsave aware apps can change the xstate_bv in the xsave - * header as well as change any contents in the memory layout. - * xrestore as part of sigreturn will capture all the changes. - */ - xstate_bv |= XSTATE_FPSSE; - - err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); - - return err; -} - -static inline int save_user_xstate(struct xsave_struct __user *buf) -{ - int err; - - if (use_xsave()) - err = xsave_user(buf); - else if (use_fxsr()) - err = fxsave_user((struct i387_fxsave_struct __user *) buf); - else - err = fsave_user((struct i387_fsave_struct __user *) buf); - - if (unlikely(err) && __clear_user(buf, xstate_size)) - err = -EFAULT; - return err; -} - -/* - * Save the fpu, extended register state to the user signal frame. - * - * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save - * state is copied. - * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. - * - * buf == buf_fx for 64-bit frames and 32-bit fsave frame. - * buf != buf_fx for 32-bit frames with fxstate. - * - * If the fpu, extended register state is live, save the state directly - * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, - * copy the thread's fpu state to the user frame starting at 'buf_fx'. - * - * If this is a 32-bit frame with fxstate, put a fsave header before - * the aligned state at 'buf_fx'. - * - * For [f]xsave state, update the SW reserved fields in the [f]xsave frame - * indicating the absence/presence of the extended state to the user. - */ -int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) -{ - struct xsave_struct *xsave = ¤t->thread.fpu.state->xsave; - struct task_struct *tsk = current; - int ia32_fxstate = (buf != buf_fx); - - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); - - if (!access_ok(VERIFY_WRITE, buf, size)) - return -EACCES; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), NULL, - (struct _fpstate_ia32 __user *) buf) ? -1 : 1; - - if (user_has_fpu()) { - /* Save the live register state to the user directly. */ - if (save_user_xstate(buf_fx)) - return -1; - /* Update the thread's fxstate to save the fsave header. */ - if (ia32_fxstate) - fpu_fxsave(&tsk->thread.fpu); - } else { - sanitize_i387_state(tsk); - if (__copy_to_user(buf_fx, xsave, xstate_size)) - return -1; - } - - /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; - - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; - - return 0; -} - -static inline void -sanitize_restored_xstate(struct task_struct *tsk, - struct user_i387_ia32_struct *ia32_env, - u64 xstate_bv, int fx_only) -{ - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; - struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr; - - if (use_xsave()) { - /* These bits must be zero. */ - memset(xsave_hdr->reserved, 0, 48); - - /* - * Init the state that is not present in the memory - * layout and not enabled by the OS. - */ - if (fx_only) - xsave_hdr->xstate_bv = XSTATE_FPSSE; - else - xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv); - } - - if (use_fxsr()) { - /* - * mscsr reserved bits must be masked to zero for security - * reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - - convert_to_fxsr(tsk, ia32_env); - } -} - -/* - * Restore the extended state if present. Otherwise, restore the FP/SSE state. - */ -static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) -{ - if (use_xsave()) { - if ((unsigned long)buf % 64 || fx_only) { - u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE; - xrstor_state(init_xstate_buf, init_bv); - return fxrstor_user(buf); - } else { - u64 init_bv = pcntxt_mask & ~xbv; - if (unlikely(init_bv)) - xrstor_state(init_xstate_buf, init_bv); - return xrestore_user(buf, xbv); - } - } else if (use_fxsr()) { - return fxrstor_user(buf); - } else - return frstor_user(buf); -} - -int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) -{ - int ia32_fxstate = (buf != buf_fx); - struct task_struct *tsk = current; - int state_size = xstate_size; - u64 xstate_bv = 0; - int fx_only = 0; - - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); - - if (!buf) { - fpu_reset_state(tsk); - return 0; - } - - if (!access_ok(VERIFY_READ, buf, size)) - return -EACCES; - - if (!used_math() && fpstate_alloc_init(tsk)) - return -1; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; - - if (use_xsave()) { - struct _fpx_sw_bytes fx_sw_user; - if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { - /* - * Couldn't find the extended state information in the - * memory layout. Restore just the FP/SSE and init all - * the other extended state. - */ - state_size = sizeof(struct i387_fxsave_struct); - fx_only = 1; - } else { - state_size = fx_sw_user.xstate_size; - xstate_bv = fx_sw_user.xstate_bv; - } - } - - if (ia32_fxstate) { - /* - * For 32-bit frames with fxstate, copy the user state to the - * thread's fpu state, reconstruct fxstate from the fsave - * header. Sanitize the copied state etc. - */ - struct fpu *fpu = &tsk->thread.fpu; - struct user_i387_ia32_struct env; - int err = 0; - - /* - * Drop the current fpu which clears used_math(). This ensures - * that any context-switch during the copy of the new state, - * avoids the intermediate state from getting restored/saved. - * Thus avoiding the new restored state from getting corrupted. - * We will be ready to restore/save the state only after - * set_used_math() is again set. - */ - drop_fpu(tsk); - - if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) { - fpstate_init(fpu); - err = -1; - } else { - sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); - } - - set_used_math(); - if (use_eager_fpu()) { - preempt_disable(); - math_state_restore(); - preempt_enable(); - } - - return err; - } else { - /* - * For 64-bit frames and 32-bit fsave frames, restore the user - * state to the registers directly (with exceptions handled). - */ - user_fpu_begin(); - if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { - fpu_reset_state(tsk); - return -1; - } - } - - return 0; -} - -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. - */ -static void prepare_fx_sw_frame(void) -{ - int fsave_header_size = sizeof(struct i387_fsave_struct); - int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - - if (config_enabled(CONFIG_X86_32)) - size += fsave_header_size; - - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xstate_bv = pcntxt_mask; - fx_sw_reserved.xstate_size = xstate_size; - - if (config_enabled(CONFIG_IA32_EMULATION)) { - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size += fsave_header_size; - } -} - -/* - * Enable the extended processor state save/restore feature - */ -static inline void xstate_enable(void) -{ - cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); -} - -/* - * Record the offsets and sizes of different state managed by the xsave - * memory layout. - */ -static void __init setup_xstate_features(void) -{ - int eax, ebx, ecx, edx, leaf = 0x2; - - xstate_features = fls64(pcntxt_mask); - xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); - xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); - - do { - cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); - - if (eax == 0) - break; - - xstate_offsets[leaf] = ebx; - xstate_sizes[leaf] = eax; - - leaf++; - } while (1); -} - -/* - * This function sets up offsets and sizes of all extended states in - * xsave area. This supports both standard format and compacted format - * of the xsave aread. - * - * Input: void - * Output: void - */ -void setup_xstate_comp(void) -{ - unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8]; - int i; - - /* - * The FP xstates and SSE xstates are legacy states. They are always - * in the fixed offsets in the xsave area in either compacted form - * or standard form. - */ - xstate_comp_offsets[0] = 0; - xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space); - - if (!cpu_has_xsaves) { - for (i = 2; i < xstate_features; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) { - xstate_comp_offsets[i] = xstate_offsets[i]; - xstate_comp_sizes[i] = xstate_sizes[i]; - } - } - return; - } - - xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; - - for (i = 2; i < xstate_features; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) - xstate_comp_sizes[i] = xstate_sizes[i]; - else - xstate_comp_sizes[i] = 0; - - if (i > 2) - xstate_comp_offsets[i] = xstate_comp_offsets[i-1] - + xstate_comp_sizes[i-1]; - - } -} - -/* - * setup the xstate image representing the init state - */ -static void __init setup_init_fpu_buf(void) -{ - /* - * Setup init_xstate_buf to represent the init state of - * all the features managed by the xsave - */ - init_xstate_buf = alloc_bootmem_align(xstate_size, - __alignof__(struct xsave_struct)); - fx_finit(&init_xstate_buf->i387); - - if (!cpu_has_xsave) - return; - - setup_xstate_features(); - - if (cpu_has_xsaves) { - init_xstate_buf->xsave_hdr.xcomp_bv = - (u64)1 << 63 | pcntxt_mask; - init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask; - } - - /* - * Init all the features state with header_bv being 0x0 - */ - xrstor_state_booting(init_xstate_buf, -1); - /* - * Dump the init state again. This is to identify the init state - * of any feature which is not represented by all zero's. - */ - xsave_state_booting(init_xstate_buf, -1); -} - -static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; -static int __init eager_fpu_setup(char *s) -{ - if (!strcmp(s, "on")) - eagerfpu = ENABLE; - else if (!strcmp(s, "off")) - eagerfpu = DISABLE; - else if (!strcmp(s, "auto")) - eagerfpu = AUTO; - return 1; -} -__setup("eagerfpu=", eager_fpu_setup); - - -/* - * Calculate total size of enabled xstates in XCR0/pcntxt_mask. - */ -static void __init init_xstate_size(void) -{ - unsigned int eax, ebx, ecx, edx; - int i; - - if (!cpu_has_xsaves) { - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xstate_size = ebx; - return; - } - - xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = 2; i < 64; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_size += eax; - } - } -} - -/* - * Enable and initialize the xsave feature. - */ -static void __init xstate_enable_boot_cpu(void) -{ - unsigned int eax, ebx, ecx, edx; - - if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { - WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); - return; - } - - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - pcntxt_mask = eax + ((u64)edx << 32); - - if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { - pr_err("FP/SSE not shown under xsave features 0x%llx\n", - pcntxt_mask); - BUG(); - } - - /* - * Support only the state known to OS. - */ - pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - - xstate_enable(); - - /* - * Recompute the context size for enabled features - */ - init_xstate_size(); - - update_regset_xstate_info(xstate_size, pcntxt_mask); - prepare_fx_sw_frame(); - setup_init_fpu_buf(); - - /* Auto enable eagerfpu for xsaveopt */ - if (cpu_has_xsaveopt && eagerfpu != DISABLE) - eagerfpu = ENABLE; - - if (pcntxt_mask & XSTATE_EAGER) { - if (eagerfpu == DISABLE) { - pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n", - pcntxt_mask & XSTATE_EAGER); - pcntxt_mask &= ~XSTATE_EAGER; - } else { - eagerfpu = ENABLE; - } - } - - pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n", - pcntxt_mask, xstate_size, - cpu_has_xsaves ? "compacted form" : "standard form"); -} - -/* - * For the very first instance, this calls xstate_enable_boot_cpu(); - * for all subsequent instances, this calls xstate_enable(). - * - * This is somewhat obfuscated due to the lack of powerful enough - * overrides for the section checks. - */ -void xsave_init(void) -{ - static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; - void (*this_func)(void); - - if (!cpu_has_xsave) - return; - - this_func = next_func; - next_func = xstate_enable; - this_func(); -} - -/* - * setup_init_fpu_buf() is __init and it is OK to call it here because - * init_xstate_buf will be unset only once during boot. - */ -void __init_refok eager_fpu_init(void) -{ - WARN_ON(used_math()); - current_thread_info()->status = 0; - - if (eagerfpu == ENABLE) - setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); - - if (!cpu_has_eager_fpu) { - stts(); - return; - } - - if (!init_xstate_buf) - setup_init_fpu_buf(); -} - -/* - * Given the xsave area and a state inside, this function returns the - * address of the state. - * - * This is the API that is called to get xstate address in either - * standard format or compacted format of xsave area. - * - * Inputs: - * xsave: base address of the xsave area; - * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE, - * etc.) - * Output: - * address of the state in the xsave area. - */ -void *get_xsave_addr(struct xsave_struct *xsave, int xstate) -{ - int feature = fls64(xstate) - 1; - if (!test_bit(feature, (unsigned long *)&pcntxt_mask)) - return NULL; - - return (void *)xsave + xstate_comp_offsets[feature]; -} -EXPORT_SYMBOL_GPL(get_xsave_addr); -- cgit v1.2.1 From f89e32e0a3df2f29d61fdc120ac62654ef267111 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 10:58:10 +0200 Subject: x86/fpu: Fix header file dependencies of fpu-internal.h Fix a minor header file dependency bug in asm/fpu-internal.h: it relies on i387.h but does not include it. All users of fpu-internal.h included it explicitly. Also remove unnecessary includes, to reduce compilation time. This also makes it easier to use it as a standalone header file for FPU internals, such as an upcoming C module in arch/x86/kernel/fpu/. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/crc32c-intel_glue.c | 1 - arch/x86/crypto/sha-mb/sha1_mb.c | 1 - arch/x86/ia32/ia32_signal.c | 1 - arch/x86/include/asm/fpu-internal.h | 9 ++------- arch/x86/kernel/cpu/common.c | 1 - arch/x86/kernel/process.c | 1 - arch/x86/kernel/process_32.c | 1 - arch/x86/kernel/process_64.c | 1 - arch/x86/kernel/ptrace.c | 1 - arch/x86/kernel/signal.c | 1 - arch/x86/kernel/smpboot.c | 1 - arch/x86/kernel/traps.c | 1 - arch/x86/kvm/x86.c | 2 +- arch/x86/mm/mpx.c | 1 - 14 files changed, 3 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 28640c3d6af7..470522cb042a 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -32,7 +32,6 @@ #include #include -#include #include #define CHKSUM_BLOCK_SIZE 1 diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index e510b1c5d690..15373786494f 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -65,7 +65,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index c81d35e6c7f1..4bafd5b05aca 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index f85d21b68901..c3b7bd12f18f 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -10,18 +10,13 @@ #ifndef _FPU_INTERNAL_H #define _FPU_INTERNAL_H -#include #include #include #include -#include -#include -#include -#include + #include -#include +#include #include -#include #ifdef CONFIG_X86_64 # include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b8035b8dd186..220ad95e0e28 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c7793addc237..35d0f1925524 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8ed2106b06da..84d647d4b14d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #ifdef CONFIG_MATH_EMULATION diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ddfdbf74f174..ae6efeccb46e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -38,7 +38,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a7bc79480719..69451b8965f7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 1ea14fd53933..35f867aa597e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -26,7 +26,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 50e547eac8cd..60e331ceb844 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,7 +68,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 231aa579d9cd..465b335e7491 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 26b1f89fc608..be276e0fe0ff 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -59,7 +59,7 @@ #include #include #include -#include +#include #include /* Ugh! */ #include #include diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index c439ec478216..37ad432e7f16 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include -- cgit v1.2.1 From 0c8675379048f36c76ad3a46519310ee2d626b2f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 10:53:34 +0200 Subject: x86/fpu: Split out the boot time FPU init code into fpu/init.c Move boot time FPU initialization code into init.c, to better isolate it into its own domain. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/Makefile | 2 +- arch/x86/kernel/fpu/core.c | 88 ----------------------------------------- arch/x86/kernel/fpu/init.c | 93 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 89 deletions(-) create mode 100644 arch/x86/kernel/fpu/init.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile index 89fd66a4b3a1..50464a716b87 100644 --- a/arch/x86/kernel/fpu/Makefile +++ b/arch/x86/kernel/fpu/Makefile @@ -2,4 +2,4 @@ # Build rules for the FPU support code: # -obj-y += core.o xsave.o +obj-y += init.o core.o xsave.o diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 01101553c6c1..9866a580952f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -139,94 +139,6 @@ void fpu__save(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(fpu__save); -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; -unsigned int xstate_size; -EXPORT_SYMBOL_GPL(xstate_size); -static struct i387_fxsave_struct fx_scratch; - -static void mxcsr_feature_mask_init(void) -{ - unsigned long mask = 0; - - if (cpu_has_fxsr) { - memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : "+m" (fx_scratch)); - mask = fx_scratch.mxcsr_mask; - if (mask == 0) - mask = 0x0000ffbf; - } - mxcsr_feature_mask &= mask; -} - -static void fpstate_xstate_init_size(void) -{ - /* - * Note that xstate_size might be overwriten later during - * xsave_init(). - */ - - if (!cpu_has_fpu) { - /* - * Disable xsave as we do not support it if i387 - * emulation is enabled. - */ - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - xstate_size = sizeof(struct i387_soft_struct); - return; - } - - if (cpu_has_fxsr) - xstate_size = sizeof(struct i387_fxsave_struct); - else - xstate_size = sizeof(struct i387_fsave_struct); -} - -/* - * Called on the boot CPU at bootup to set up the initial FPU state that - * is later cloned into all processes. - * - * Also called on secondary CPUs to set up the FPU state of their - * idle threads. - */ -void fpu__cpu_init(void) -{ - unsigned long cr0; - unsigned long cr4_mask = 0; - -#ifndef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) { - pr_emerg("No FPU found and no math emulation present\n"); - pr_emerg("Giving up\n"); - for (;;) - asm volatile("hlt"); - } -#endif - if (cpu_has_fxsr) - cr4_mask |= X86_CR4_OSFXSR; - if (cpu_has_xmm) - cr4_mask |= X86_CR4_OSXMMEXCPT; - if (cr4_mask) - cr4_set_bits(cr4_mask); - - cr0 = read_cr0(); - cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ - if (!cpu_has_fpu) - cr0 |= X86_CR0_EM; - write_cr0(cr0); - - /* - * fpstate_xstate_init_size() is only called once, to avoid overriding - * 'xstate_size' during (secondary CPU) bootup or during CPU hotplug. - */ - if (xstate_size == 0) - fpstate_xstate_init_size(); - - mxcsr_feature_mask_init(); - xsave_init(); - eager_fpu_init(); -} - void fpstate_init(struct fpu *fpu) { if (!cpu_has_fpu) { diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c new file mode 100644 index 000000000000..0a666298abbd --- /dev/null +++ b/arch/x86/kernel/fpu/init.c @@ -0,0 +1,93 @@ +/* + * x86 FPU boot time init code + */ +#include +#include + +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int xstate_size; +EXPORT_SYMBOL_GPL(xstate_size); +static struct i387_fxsave_struct fx_scratch; + +static void mxcsr_feature_mask_init(void) +{ + unsigned long mask = 0; + + if (cpu_has_fxsr) { + memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : "+m" (fx_scratch)); + mask = fx_scratch.mxcsr_mask; + if (mask == 0) + mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; +} + +static void fpstate_xstate_init_size(void) +{ + /* + * Note that xstate_size might be overwriten later during + * xsave_init(). + */ + + if (!cpu_has_fpu) { + /* + * Disable xsave as we do not support it if i387 + * emulation is enabled. + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + xstate_size = sizeof(struct i387_soft_struct); + return; + } + + if (cpu_has_fxsr) + xstate_size = sizeof(struct i387_fxsave_struct); + else + xstate_size = sizeof(struct i387_fsave_struct); +} + +/* + * Called on the boot CPU at bootup to set up the initial FPU state that + * is later cloned into all processes. + * + * Also called on secondary CPUs to set up the FPU state of their + * idle threads. + */ +void fpu__cpu_init(void) +{ + unsigned long cr0; + unsigned long cr4_mask = 0; + +#ifndef CONFIG_MATH_EMULATION + if (!cpu_has_fpu) { + pr_emerg("No FPU found and no math emulation present\n"); + pr_emerg("Giving up\n"); + for (;;) + asm volatile("hlt"); + } +#endif + if (cpu_has_fxsr) + cr4_mask |= X86_CR4_OSFXSR; + if (cpu_has_xmm) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + cr4_set_bits(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!cpu_has_fpu) + cr0 |= X86_CR0_EM; + write_cr0(cr0); + + /* + * fpstate_xstate_init_size() is only called once, to avoid overriding + * 'xstate_size' during (secondary CPU) bootup or during CPU hotplug. + */ + if (xstate_size == 0) + fpstate_xstate_init_size(); + + mxcsr_feature_mask_init(); + xsave_init(); + eager_fpu_init(); +} -- cgit v1.2.1 From 4445e6e9a549823a2c2a188e500389532e1ed501 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 11:33:25 +0200 Subject: x86/fpu: Remove unnecessary includes from core.c fpu/core.c includes a lot of files for mostly historic reasons. It only needs fpu-internal.h, which already includes all the required headers. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9866a580952f..b05199fa168c 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -5,20 +5,7 @@ * General FPU state handling cleanups * Gareth Hughes , May 2000 */ -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include #include -#include static DEFINE_PER_CPU(bool, in_kernel_fpu); -- cgit v1.2.1 From 146ed598d12ac173bf5fed05ba7046812b8a8978 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 11:36:14 +0200 Subject: x86/fpu: Move the no_387 handling and FPU detection code into init.c Both no_387() and fpu__detect() run at boot time, so they belong into init.c. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 34 ---------------------------------- arch/x86/kernel/fpu/init.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b05199fa168c..9211582f5d3f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -581,37 +581,3 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) EXPORT_SYMBOL(dump_fpu); #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ - -static int __init no_387(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FPU); - return 1; -} - -__setup("no387", no_387); - -/* - * Set the X86_FEATURE_FPU CPU-capability bit based on - * trying to execute an actual sequence of FPU instructions: - */ -void fpu__detect(struct cpuinfo_x86 *c) -{ - unsigned long cr0; - u16 fsw, fcw; - - fsw = fcw = 0xffff; - - cr0 = read_cr0(); - cr0 &= ~(X86_CR0_TS | X86_CR0_EM); - write_cr0(cr0); - - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); - - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); - else - clear_cpu_cap(c, X86_FEATURE_FPU); - - /* The final cr0 value is set in fpu_init() */ -} diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 0a666298abbd..5e06aa6cc22e 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -91,3 +91,37 @@ void fpu__cpu_init(void) xsave_init(); eager_fpu_init(); } + +static int __init no_387(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FPU); + return 1; +} + +__setup("no387", no_387); + +/* + * Set the X86_FEATURE_FPU CPU-capability bit based on + * trying to execute an actual sequence of FPU instructions: + */ +void fpu__detect(struct cpuinfo_x86 *c) +{ + unsigned long cr0; + u16 fsw, fcw; + + fsw = fcw = 0xffff; + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS | X86_CR0_EM); + write_cr0(cr0); + + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); + + if (fsw == 0 && (fcw & 0x103f) == 0x003f) + set_cpu_cap(c, X86_FEATURE_FPU); + else + clear_cpu_cap(c, X86_FEATURE_FPU); + + /* The final cr0 value is set in fpu_init() */ +} -- cgit v1.2.1 From 11ad19277e025f914518bc2943a240cdd37cf844 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 11:44:46 +0200 Subject: x86/fpu: Remove the free_thread_xstate() complication Use fpstate_free() directly to manage FPU state. Only process.c was using this method, so this is a speedup as well, as it removes the extra function call and related clobbers. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/process.c | 9 ++------- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6b75c4b927ec..fef8db024ece 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -362,7 +362,6 @@ DECLARE_PER_CPU(struct irq_stack *, softirq_stack); #endif /* X86_64 */ extern unsigned int xstate_size; -extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; struct perf_event; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 35d0f1925524..6ab180f40a7e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -99,14 +99,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } -void free_thread_xstate(struct task_struct *tsk) -{ - fpstate_free(&tsk->thread.fpu); -} - void arch_release_task_struct(struct task_struct *tsk) { - free_thread_xstate(tsk); + fpstate_free(&tsk->thread.fpu); } void arch_task_cache_init(void) @@ -154,7 +149,7 @@ void flush_thread(void) if (!use_eager_fpu()) { /* FPU state will be reallocated lazily at the first use. */ drop_fpu(tsk); - free_thread_xstate(tsk); + fpstate_free(&tsk->thread.fpu); } else { if (!tsk_used_math(tsk)) { /* kthread execs. TODO: cleanup this horror. */ -- cgit v1.2.1 From 81683cc8277e79decff4d0cf82ae0e17d2fe465f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 11:52:13 +0200 Subject: x86/fpu: Factor out fpu__flush_thread() from flush_thread() flush_thread() open codes a lot of FPU internals - create a separate function for it in fpu/core.c. Turns out that this does not hurt performance: text data bss dec hex filename 11843039 1884440 1130496 14857975 e2b6f7 vmlinux.before 11843039 1884440 1130496 14857975 e2b6f7 vmlinux.after and since this is a slowpath clarity comes first anyway. We can reconsider inlining decisions after the FPU code has been cleaned up. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 1 + arch/x86/kernel/fpu/core.c | 17 +++++++++++++++++ arch/x86/kernel/process.c | 14 +------------- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 6552a16e0e38..d6fc84440b73 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -20,6 +20,7 @@ struct user_i387_struct; extern int fpstate_alloc_init(struct task_struct *curr); extern void fpstate_init(struct fpu *fpu); +extern void fpu__flush_thread(struct task_struct *tsk); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern void math_state_restore(void); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9211582f5d3f..787bf57b8422 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -227,6 +227,23 @@ static int fpu__unlazy_stopped(struct task_struct *child) return 0; } +void fpu__flush_thread(struct task_struct *tsk) +{ + if (!use_eager_fpu()) { + /* FPU state will be reallocated lazily at the first use. */ + drop_fpu(tsk); + fpstate_free(&tsk->thread.fpu); + } else { + if (!tsk_used_math(tsk)) { + /* kthread execs. TODO: cleanup this horror. */ + if (WARN_ON(fpstate_alloc_init(tsk))) + force_sig(SIGKILL, tsk); + user_fpu_begin(); + } + restore_init_xstate(); + } +} + /* * The xstateregs_active() routine is the same as the fpregs_active() routine, * as the "regset->n" for the xstate regset will be updated based on the feature diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6ab180f40a7e..52fd8f6f44c7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -146,19 +146,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - if (!use_eager_fpu()) { - /* FPU state will be reallocated lazily at the first use. */ - drop_fpu(tsk); - fpstate_free(&tsk->thread.fpu); - } else { - if (!tsk_used_math(tsk)) { - /* kthread execs. TODO: cleanup this horror. */ - if (WARN_ON(fpstate_alloc_init(tsk))) - force_sig(SIGKILL, tsk); - user_fpu_begin(); - } - restore_init_xstate(); - } + fpu__flush_thread(tsk); } static void hard_disable_TSC(void) -- cgit v1.2.1 From 93b90712c64ca2db4b39fcb2e7dffcf0d478468d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 12:50:13 +0200 Subject: x86/fpu: Move math_state_restore() to fpu/core.c It's another piece of FPU internals that is better off close to the other FPU internals. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 42 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 42 ------------------------------------------ 2 files changed, 42 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 787bf57b8422..7add2fb7369e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -227,6 +227,48 @@ static int fpu__unlazy_stopped(struct task_struct *child) return 0; } +/* + * 'math_state_restore()' saves the current math information in the + * old math state array, and gets the new ones from the current task + * + * Careful.. There are problems with IBM-designed IRQ13 behaviour. + * Don't touch unless you *really* know how it works. + * + * Must be called with kernel preemption disabled (eg with local + * local interrupts as in the case of do_device_not_available). + */ +void math_state_restore(void) +{ + struct task_struct *tsk = current; + + if (!tsk_used_math(tsk)) { + local_irq_enable(); + /* + * does a slab alloc which can sleep + */ + if (fpstate_alloc_init(tsk)) { + /* + * ran out of memory! + */ + do_group_exit(SIGKILL); + return; + } + local_irq_disable(); + } + + /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */ + kernel_fpu_disable(); + __thread_fpu_begin(tsk); + if (unlikely(restore_fpu_checking(tsk))) { + fpu_reset_state(tsk); + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); + } else { + tsk->thread.fpu.counter++; + } + kernel_fpu_enable(); +} +EXPORT_SYMBOL_GPL(math_state_restore); + void fpu__flush_thread(struct task_struct *tsk) { if (!use_eager_fpu()) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 465b335e7491..63c7fc3677b4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -826,48 +826,6 @@ asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) { } -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - * - * Must be called with kernel preemption disabled (eg with local - * local interrupts as in the case of do_device_not_available). - */ -void math_state_restore(void) -{ - struct task_struct *tsk = current; - - if (!tsk_used_math(tsk)) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (fpstate_alloc_init(tsk)) { - /* - * ran out of memory! - */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } - - /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */ - kernel_fpu_disable(); - __thread_fpu_begin(tsk); - if (unlikely(restore_fpu_checking(tsk))) { - fpu_reset_state(tsk); - force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); - } else { - tsk->thread.fpu.counter++; - } - kernel_fpu_enable(); -} -EXPORT_SYMBOL_GPL(math_state_restore); - dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { -- cgit v1.2.1 From 3a0aee4801d475b64a408539c01ec0d17d52192b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 13:16:47 +0200 Subject: x86/fpu: Rename math_state_restore() to fpu__restore() Move to the new fpu__*() namespace. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 2 +- arch/x86/kernel/fpu/core.c | 6 +++--- arch/x86/kernel/fpu/xsave.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/traps.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index d6fc84440b73..c8ee395dd6c6 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -23,7 +23,7 @@ extern void fpstate_init(struct fpu *fpu); extern void fpu__flush_thread(struct task_struct *tsk); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); -extern void math_state_restore(void); +extern void fpu__restore(void); extern bool irq_fpu_usable(void); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7add2fb7369e..15c3cf7bd160 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -228,7 +228,7 @@ static int fpu__unlazy_stopped(struct task_struct *child) } /* - * 'math_state_restore()' saves the current math information in the + * 'fpu__restore()' saves the current math information in the * old math state array, and gets the new ones from the current task * * Careful.. There are problems with IBM-designed IRQ13 behaviour. @@ -237,7 +237,7 @@ static int fpu__unlazy_stopped(struct task_struct *child) * Must be called with kernel preemption disabled (eg with local * local interrupts as in the case of do_device_not_available). */ -void math_state_restore(void) +void fpu__restore(void) { struct task_struct *tsk = current; @@ -267,7 +267,7 @@ void math_state_restore(void) } kernel_fpu_enable(); } -EXPORT_SYMBOL_GPL(math_state_restore); +EXPORT_SYMBOL_GPL(fpu__restore); void fpu__flush_thread(struct task_struct *tsk) { diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 163b5cc582ef..d913d5024901 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -404,7 +404,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) set_used_math(); if (use_eager_fpu()) { preempt_disable(); - math_state_restore(); + fpu__restore(); preempt_enable(); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 84d647d4b14d..1a0edce626b2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -295,7 +295,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so * the GDT and LDT are properly updated, and must be - * done before math_state_restore, so the TS bit is up + * done before fpu__restore(), so the TS bit is up * to date. */ arch_end_context_switch(next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ae6efeccb46e..99cc4b8589ad 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -298,7 +298,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Leave lazy mode, flushing any hypercalls made here. This * must be done after loading TLS entries in the GDT but before * loading segments that might reference them, and and it must - * be done before math_state_restore, so the TS bit is up to + * be done before fpu__restore(), so the TS bit is up to * date. */ arch_end_context_switch(next_p); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 63c7fc3677b4..22ad90a40dbf 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -846,7 +846,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) return; } #endif - math_state_restore(); /* interrupts still off */ + fpu__restore(); /* interrupts still off */ #ifdef CONFIG_X86_32 conditional_sti(regs); #endif -- cgit v1.2.1 From 4d1640927bd54aa118f91c2bcfe6c2de0e2ba2a3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 13:44:25 +0200 Subject: x86/fpu: Factor out the FPU bug detection code into fpu__init_check_bugs() Move the boot-time FPU bug detection code to the other FPU boot time init code in fpu/init.c. No change in code size: text data bss dec hex filename 13044568 1884440 1130496 16059504 f50c70 vmlinux.before 13044568 1884440 1130496 16059504 f50c70 vmlinux.after Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 1 + arch/x86/kernel/cpu/bugs.c | 53 +------------------------------------- arch/x86/kernel/fpu/init.c | 63 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c8ee395dd6c6..89ae3e051741 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -24,6 +24,7 @@ extern void fpu__flush_thread(struct task_struct *tsk); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern void fpu__restore(void); +extern void fpu__init_check_bugs(void); extern bool irq_fpu_usable(void); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 03445346ee0a..eb8be0c5823b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -17,52 +17,6 @@ #include #include -static double __initdata x = 4195835.0; -static double __initdata y = 3145727.0; - -/* - * This used to check for exceptions.. - * However, it turns out that to support that, - * the XMM trap handlers basically had to - * be buggy. So let's have a correct XMM trap - * handler, and forget about printing out - * some status at boot. - * - * We should really only care about bugs here - * anyway. Not features. - */ -static void __init check_fpu(void) -{ - s32 fdiv_bug; - - kernel_fpu_begin(); - - /* - * trap_init() enabled FXSR and company _before_ testing for FP - * problems here. - * - * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug - */ - __asm__("fninit\n\t" - "fldl %1\n\t" - "fdivl %2\n\t" - "fmull %2\n\t" - "fldl %1\n\t" - "fsubp %%st,%%st(1)\n\t" - "fistpl %0\n\t" - "fwait\n\t" - "fninit" - : "=m" (*&fdiv_bug) - : "m" (*&x), "m" (*&y)); - - kernel_fpu_end(); - - if (fdiv_bug) { - set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); - pr_warn("Hmm, FPU with FDIV bug\n"); - } -} - void __init check_bugs(void) { identify_boot_cpu(); @@ -85,10 +39,5 @@ void __init check_bugs(void) '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); - /* - * kernel_fpu_begin/end() in check_fpu() relies on the patched - * alternative instructions. - */ - if (cpu_has_fpu) - check_fpu(); + fpu__init_check_bugs(); } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 5e06aa6cc22e..4eabb426e910 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -4,6 +4,69 @@ #include #include +/* + * Boot time CPU/FPU FDIV bug detection code: + */ + +static double __initdata x = 4195835.0; +static double __initdata y = 3145727.0; + +/* + * This used to check for exceptions.. + * However, it turns out that to support that, + * the XMM trap handlers basically had to + * be buggy. So let's have a correct XMM trap + * handler, and forget about printing out + * some status at boot. + * + * We should really only care about bugs here + * anyway. Not features. + */ +static void __init check_fpu(void) +{ + s32 fdiv_bug; + + kernel_fpu_begin(); + + /* + * trap_init() enabled FXSR and company _before_ testing for FP + * problems here. + * + * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug + */ + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&fdiv_bug) + : "m" (*&x), "m" (*&y)); + + kernel_fpu_end(); + + if (fdiv_bug) { + set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); + pr_warn("Hmm, FPU with FDIV bug\n"); + } +} + +void fpu__init_check_bugs(void) +{ + /* + * kernel_fpu_begin/end() in check_fpu() relies on the patched + * alternative instructions. + */ + if (cpu_has_fpu) + check_fpu(); +} + +/* + * Boot time FPU feature detection code: + */ unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); -- cgit v1.2.1 From 3e261c14e41b3a3cc1da54190d56b8609bedd873 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 15:08:34 +0200 Subject: x86/fpu: Simplify the xsave_state*() methods These functions (xsave_state() and xsave_state_booting()) have a 'mask' argument that is always -1. Propagate this into the functions instead and eliminate the extra argument. Does not change the generated code, because these were inlined functions. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 4 ++-- arch/x86/include/asm/xsave.h | 8 +++++--- arch/x86/kernel/fpu/xsave.c | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index c3b7bd12f18f..b1f5dd63cfeb 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -527,9 +527,9 @@ static inline void __save_fpu(struct task_struct *tsk) { if (use_xsave()) { if (unlikely(system_state == SYSTEM_BOOTING)) - xsave_state_booting(&tsk->thread.fpu.state->xsave, -1); + xsave_state_booting(&tsk->thread.fpu.state->xsave); else - xsave_state(&tsk->thread.fpu.state->xsave, -1); + xsave_state(&tsk->thread.fpu.state->xsave); } else fpu_fxsave(&tsk->thread.fpu); } diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 58ed0ca5a11e..61c951ce77fe 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -70,8 +70,9 @@ extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask) +static inline int xsave_state_booting(struct xsave_struct *fx) { + u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; int err = 0; @@ -123,8 +124,9 @@ static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) /* * Save processor xstate to xsave area. */ -static inline int xsave_state(struct xsave_struct *fx, u64 mask) +static inline int xsave_state(struct xsave_struct *fx) { + u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; int err = 0; @@ -189,7 +191,7 @@ static inline int xrstor_state(struct xsave_struct *fx, u64 mask) */ static inline void fpu_xsave(struct fpu *fpu) { - xsave_state(&fpu->state->xsave, -1); + xsave_state(&fpu->state->xsave); } /* diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index d913d5024901..a52205b87acb 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -558,11 +558,12 @@ static void __init setup_init_fpu_buf(void) * Init all the features state with header_bv being 0x0 */ xrstor_state_booting(init_xstate_buf, -1); + /* * Dump the init state again. This is to identify the init state * of any feature which is not represented by all zero's. */ - xsave_state_booting(init_xstate_buf, -1); + xsave_state_booting(init_xstate_buf); } static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; -- cgit v1.2.1 From 0afc4a941c0e7b8f6f619fe576f7c5ddbe78d304 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 15:14:44 +0200 Subject: x86/fpu: Remove fpu_xsave() It's a pointless wrapper now - use xsave_state(). Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 2 +- arch/x86/include/asm/xsave.h | 8 -------- arch/x86/mm/mpx.c | 2 +- 3 files changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index b1f5dd63cfeb..95e04cb1ed2f 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -265,7 +265,7 @@ static inline void fpu_fxsave(struct fpu *fpu) static inline int fpu_save_init(struct fpu *fpu) { if (use_xsave()) { - fpu_xsave(fpu); + xsave_state(&fpu->state->xsave); /* * xsave header may indicate the init state of the FP. diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 61c951ce77fe..7c90ea93c54e 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -186,14 +186,6 @@ static inline int xrstor_state(struct xsave_struct *fx, u64 mask) return err; } -/* - * Save xstate context for old process during context switch. - */ -static inline void fpu_xsave(struct fpu *fpu) -{ - xsave_state(&fpu->state->xsave); -} - /* * Restore xstate context for new process during context switch. */ diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 37ad432e7f16..412b5f81e547 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -389,7 +389,7 @@ int mpx_enable_management(struct task_struct *tsk) * directory into XSAVE/XRSTOR Save Area and enable MPX through * XRSTOR instruction. * - * fpu_xsave() is expected to be very expensive. Storing the bounds + * xsave_state() is expected to be very expensive. Storing the bounds * directory here means that we do not have to do xsave in the unmap * path; we can just use mm->bd_addr instead. */ -- cgit v1.2.1 From 8ffb53ab986ccb4421b1060182c6e084edd7b9d8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 15:41:56 +0200 Subject: x86/fpu: Move task_xstate_cachep handling to core.c This code was historically in process.c, now we have FPU core internals in fpu/core.c instead - move it there. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 2 ++ arch/x86/kernel/fpu/core.c | 15 +++++++++++++++ arch/x86/kernel/process.c | 9 +-------- 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 95e04cb1ed2f..f41170c6d376 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -564,6 +564,8 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) } } +extern void fpstate_cache_init(void); + extern int fpstate_alloc(struct fpu *fpu); static inline void fpstate_free(struct fpu *fpu) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 15c3cf7bd160..b32a6eb7f189 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -147,6 +147,21 @@ void fpstate_init(struct fpu *fpu) } EXPORT_SYMBOL_GPL(fpstate_init); +/* + * FPU state allocation: + */ +struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL_GPL(task_xstate_cachep); + +void fpstate_cache_init(void) +{ + task_xstate_cachep = + kmem_cache_create("task_xstate", xstate_size, + __alignof__(union thread_xstate), + SLAB_PANIC | SLAB_NOTRACK, NULL); + setup_xstate_comp(); +} + int fpstate_alloc(struct fpu *fpu) { if (fpu->state) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 52fd8f6f44c7..36d9f737f278 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -75,9 +75,6 @@ void idle_notifier_unregister(struct notifier_block *n) EXPORT_SYMBOL_GPL(idle_notifier_unregister); #endif -struct kmem_cache *task_xstate_cachep; -EXPORT_SYMBOL_GPL(task_xstate_cachep); - /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. @@ -106,11 +103,7 @@ void arch_release_task_struct(struct task_struct *tsk) void arch_task_cache_init(void) { - task_xstate_cachep = - kmem_cache_create("task_xstate", xstate_size, - __alignof__(union thread_xstate), - SLAB_PANIC | SLAB_NOTRACK, NULL); - setup_xstate_comp(); + fpstate_cache_init(); } /* -- cgit v1.2.1 From a752b53d9dcbae28a3a22b5577f0571acf53d5aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 15:47:05 +0200 Subject: x86/fpu: Factor out fpu__copy() Introduce fpu__copy() and use it in arch_dup_task_struct(), thus moving another chunk of FPU logic to fpu/core.c. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 1 + arch/x86/kernel/fpu/core.c | 18 ++++++++++++++++++ arch/x86/kernel/process.c | 12 +----------- 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index f41170c6d376..6c1ceb7c3f9a 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -567,6 +567,7 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) extern void fpstate_cache_init(void); extern int fpstate_alloc(struct fpu *fpu); +extern int fpu__copy(struct task_struct *dst, struct task_struct *src); static inline void fpstate_free(struct fpu *fpu) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b32a6eb7f189..05df212449ed 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -178,6 +178,24 @@ int fpstate_alloc(struct fpu *fpu) } EXPORT_SYMBOL_GPL(fpstate_alloc); +int fpu__copy(struct task_struct *dst, struct task_struct *src) +{ + dst->thread.fpu.counter = 0; + dst->thread.fpu.has_fpu = 0; + dst->thread.fpu.state = NULL; + + task_disable_lazy_fpu_restore(dst); + + if (tsk_used_math(src)) { + int err = fpstate_alloc(&dst->thread.fpu); + + if (err) + return err; + fpu_copy(dst, src); + } + return 0; +} + /* * Allocate the backing store for the current task's FPU registers * and initialize the registers themselves as well. diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 36d9f737f278..bb7d4abcdad6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -83,17 +83,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; - dst->thread.fpu.counter = 0; - dst->thread.fpu.has_fpu = 0; - dst->thread.fpu.state = NULL; - task_disable_lazy_fpu_restore(dst); - if (tsk_used_math(src)) { - int err = fpstate_alloc(&dst->thread.fpu); - if (err) - return err; - fpu_copy(dst, src); - } - return 0; + return fpu__copy(dst, src); } void arch_release_task_struct(struct task_struct *tsk) -- cgit v1.2.1 From 5a12bf6332da40310eff5575ca1ba20339d74e48 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 15:58:37 +0200 Subject: x86/fpu: Uninline fpstate_free() and move it next to the allocation function Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 9 +-------- arch/x86/kernel/fpu/core.c | 9 +++++++++ 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 6c1ceb7c3f9a..16a1c66cf4ee 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -567,16 +567,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) extern void fpstate_cache_init(void); extern int fpstate_alloc(struct fpu *fpu); +extern void fpstate_free(struct fpu *fpu); extern int fpu__copy(struct task_struct *dst, struct task_struct *src); -static inline void fpstate_free(struct fpu *fpu) -{ - if (fpu->state) { - kmem_cache_free(task_xstate_cachep, fpu->state); - fpu->state = NULL; - } -} - static inline void fpu_copy(struct task_struct *dst, struct task_struct *src) { if (use_eager_fpu()) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 05df212449ed..b00d1b3c5811 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -178,6 +178,15 @@ int fpstate_alloc(struct fpu *fpu) } EXPORT_SYMBOL_GPL(fpstate_alloc); +void fpstate_free(struct fpu *fpu) +{ + if (fpu->state) { + kmem_cache_free(task_xstate_cachep, fpu->state); + fpu->state = NULL; + } +} +EXPORT_SYMBOL_GPL(fpstate_free); + int fpu__copy(struct task_struct *dst, struct task_struct *src) { dst->thread.fpu.counter = 0; -- cgit v1.2.1 From f55f88e25e9b5232054a82d47de7aaf67179b78b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 16:02:33 +0200 Subject: x86/fpu: Make task_xstate_cachep static It's now local to fpu/core.c, make it static. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/fpu/core.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index fef8db024ece..d50cc7f61559 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -362,7 +362,6 @@ DECLARE_PER_CPU(struct irq_stack *, softirq_stack); #endif /* X86_64 */ extern unsigned int xstate_size; -extern struct kmem_cache *task_xstate_cachep; struct perf_event; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b00d1b3c5811..d0fcf741f70b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -150,8 +150,7 @@ EXPORT_SYMBOL_GPL(fpstate_init); /* * FPU state allocation: */ -struct kmem_cache *task_xstate_cachep; -EXPORT_SYMBOL_GPL(task_xstate_cachep); +static struct kmem_cache *task_xstate_cachep; void fpstate_cache_init(void) { -- cgit v1.2.1 From 416d49ac67ae3af8c98ecee2ebe0a883b95e213a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 16:33:08 +0200 Subject: x86/fpu: Make kernel_fpu_disable/enable() static This allows the compiler to inline them and to eliminate them: arch/x86/kernel/fpu/core.o: text data bss dec hex filename 6741 4 8 6753 1a61 core.o.before 6716 4 8 6728 1a48 core.o.after Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 4 ---- arch/x86/kernel/fpu/core.c | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 89ae3e051741..e69989f95da5 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -54,10 +54,6 @@ static inline void kernel_fpu_end(void) preempt_enable(); } -/* Must be called with preempt disabled */ -extern void kernel_fpu_disable(void); -extern void kernel_fpu_enable(void); - /* * Some instructions like VIA's padlock instructions generate a spurious * DNA fault but don't modify SSE registers. And these instructions diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d0fcf741f70b..161820526ad3 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -9,13 +9,13 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu); -void kernel_fpu_disable(void) +static void kernel_fpu_disable(void) { WARN_ON(this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, true); } -void kernel_fpu_enable(void) +static void kernel_fpu_enable(void) { this_cpu_write(in_kernel_fpu, false); } @@ -32,7 +32,7 @@ void kernel_fpu_enable(void) * Except for the eagerfpu case when we return true; in the likely case * the thread has FPU but we are not going to set/clear TS. */ -static inline bool interrupted_kernel_fpu_idle(void) +static bool interrupted_kernel_fpu_idle(void) { if (this_cpu_read(in_kernel_fpu)) return false; @@ -52,7 +52,7 @@ static inline bool interrupted_kernel_fpu_idle(void) * in an interrupt context from user mode - we'll just * save the FPU state as required. */ -static inline bool interrupted_user_mode(void) +static bool interrupted_user_mode(void) { struct pt_regs *regs = get_irq_regs(); return regs && user_mode(regs); -- cgit v1.2.1 From 3103ae3a6d3e66d51bb883bb17b55574e163b77d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 16:40:56 +0200 Subject: x86/fpu: Add debug check to kernel_fpu_disable() We are not supposed to call kernel_fpu_disable() if we have not previously enabled it. Also use kernel_fpu_disable()/enable() in the __kernel_fpu_begin/end() primitives, instead of writing to in_kernel_fpu directly, so that we get the debugging checks. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 161820526ad3..9bc573a5c9db 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -17,6 +17,7 @@ static void kernel_fpu_disable(void) static void kernel_fpu_enable(void) { + WARN_ON_ONCE(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); } @@ -77,7 +78,7 @@ void __kernel_fpu_begin(void) { struct task_struct *me = current; - this_cpu_write(in_kernel_fpu, true); + kernel_fpu_disable(); if (__thread_has_fpu(me)) { __save_init_fpu(me); @@ -100,7 +101,7 @@ void __kernel_fpu_end(void) stts(); } - this_cpu_write(in_kernel_fpu, false); + kernel_fpu_enable(); } EXPORT_SYMBOL(__kernel_fpu_end); -- cgit v1.2.1 From 085cc281a04633761bac361f26dcee2800d58077 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 16:52:03 +0200 Subject: x86/fpu: Add kernel_fpu_disabled() Instead of open-coded in_kernel_fpu access, Use kernel_fpu_disabled() in interrupted_kernel_fpu_idle(), matching the other kernel_fpu_*() methods. Also add some documentation for in_kernel_fpu. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9bc573a5c9db..e898e83afa0a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -7,6 +7,17 @@ */ #include +/* + * Track whether the kernel is using the FPU state + * currently. + * + * This flag is used: + * + * - by IRQ context code to potentially use the FPU + * if it's unused. + * + * - to debug kernel_fpu_begin()/end() correctness + */ static DEFINE_PER_CPU(bool, in_kernel_fpu); static void kernel_fpu_disable(void) @@ -21,6 +32,11 @@ static void kernel_fpu_enable(void) this_cpu_write(in_kernel_fpu, false); } +static bool kernel_fpu_disabled(void) +{ + return this_cpu_read(in_kernel_fpu); +} + /* * Were we in an interrupt that interrupted kernel mode? * @@ -35,7 +51,7 @@ static void kernel_fpu_enable(void) */ static bool interrupted_kernel_fpu_idle(void) { - if (this_cpu_read(in_kernel_fpu)) + if (kernel_fpu_disabled()) return false; if (use_eager_fpu()) -- cgit v1.2.1 From 6522d783773d0d61f9f35cae890f8c11c4510d9a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 19:54:09 +0200 Subject: x86/fpu: Remove __save_init_fpu() __save_init_fpu() is just a trivial wrapper around fpu_save_init(). Remove the extra layer of obfuscation. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 7 +------ arch/x86/kernel/fpu/core.c | 4 ++-- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 16a1c66cf4ee..1e2b6c67b1f1 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -295,11 +295,6 @@ static inline int fpu_save_init(struct fpu *fpu) return 1; } -static inline int __save_init_fpu(struct task_struct *tsk) -{ - return fpu_save_init(&tsk->thread.fpu); -} - static inline int fpu_restore_checking(struct fpu *fpu) { if (use_xsave()) @@ -439,7 +434,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta (use_eager_fpu() || new->thread.fpu.counter > 5); if (__thread_has_fpu(old)) { - if (!__save_init_fpu(old)) + if (!fpu_save_init(&old->thread.fpu)) task_disable_lazy_fpu_restore(old); else old->thread.fpu.last_cpu = cpu; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e898e83afa0a..6ce971ccd85b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -97,7 +97,7 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); if (__thread_has_fpu(me)) { - __save_init_fpu(me); + fpu_save_init(&me->thread.fpu); } else { this_cpu_write(fpu_owner_task, NULL); if (!use_eager_fpu()) @@ -135,7 +135,7 @@ void fpu__save(struct task_struct *tsk) if (use_eager_fpu()) { __save_fpu(tsk); } else { - __save_init_fpu(tsk); + fpu_save_init(&tsk->thread.fpu); __thread_fpu_end(tsk); } } -- cgit v1.2.1 From e102f30f4e22b7eb8f3dfbe7fec334cffb350fd8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 22 Apr 2015 20:09:29 +0200 Subject: x86/fpu: Move fpu_copy() to fpu/core.c Move fpu_copy() where its only user is. Beyond readability this also speeds up compilation, as fpu-internal.h is included in over a dozen .c files. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 14 -------------- arch/x86/kernel/fpu/core.c | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 1e2b6c67b1f1..e180fb96dd0d 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -565,20 +565,6 @@ extern int fpstate_alloc(struct fpu *fpu); extern void fpstate_free(struct fpu *fpu); extern int fpu__copy(struct task_struct *dst, struct task_struct *src); -static inline void fpu_copy(struct task_struct *dst, struct task_struct *src) -{ - if (use_eager_fpu()) { - memset(&dst->thread.fpu.state->xsave, 0, xstate_size); - __save_fpu(dst); - } else { - struct fpu *dfpu = &dst->thread.fpu; - struct fpu *sfpu = &src->thread.fpu; - - fpu__save(src); - memcpy(dfpu->state, sfpu->state, xstate_size); - } -} - static inline unsigned long alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 6ce971ccd85b..2cc2380b95ce 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -203,6 +203,20 @@ void fpstate_free(struct fpu *fpu) } EXPORT_SYMBOL_GPL(fpstate_free); +static void fpu_copy(struct task_struct *dst, struct task_struct *src) +{ + if (use_eager_fpu()) { + memset(&dst->thread.fpu.state->xsave, 0, xstate_size); + __save_fpu(dst); + } else { + struct fpu *dfpu = &dst->thread.fpu; + struct fpu *sfpu = &src->thread.fpu; + + fpu__save(src); + memcpy(dfpu->state, sfpu->state, xstate_size); + } +} + int fpu__copy(struct task_struct *dst, struct task_struct *src) { dst->thread.fpu.counter = 0; -- cgit v1.2.1 From bfd6fc0581e7e2f3fa0b3e5e21cd6e54c3fbd16f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 08:55:34 +0200 Subject: x86/fpu: Add debugging check to fpu_copy() Also add a bit of documentation. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 2cc2380b95ce..3aeab3f12835 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -203,8 +203,18 @@ void fpstate_free(struct fpu *fpu) } EXPORT_SYMBOL_GPL(fpstate_free); +/* + * Copy the current task's FPU state to a new task's FPU context. + * + * In the 'eager' case we just save to the destination context. + * + * In the 'lazy' case we save to the source context, mark the FPU lazy + * via stts() and copy the source context into the destination context. + */ static void fpu_copy(struct task_struct *dst, struct task_struct *src) { + WARN_ON(src != current); + if (use_eager_fpu()) { memset(&dst->thread.fpu.state->xsave, 0, xstate_size); __save_fpu(dst); -- cgit v1.2.1 From 9a89b02918c03d91e685d06e49a652da5f35befc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 11:26:08 +0200 Subject: x86/fpu: Print out whether we are doing lazy/eager FPU context switches Ever since the kernel started defaulting to eager FPU switches on modern Intel CPUs it's not been obvious whether a given system is using the lazy or the eager FPU context switching logic. So generate a boot message about which mode the FPU code is in: x86/fpu: Using 'lazy' FPU context switches. or: x86/fpu: Using 'eager' FPU context switches. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index a52205b87acb..61696c5005eb 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -691,6 +691,8 @@ void __init_refok eager_fpu_init(void) if (eagerfpu == ENABLE) setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + printk_once(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); + if (!cpu_has_eager_fpu) { stts(); return; -- cgit v1.2.1 From 276983f8085db4a5f4e2cdcda6bce29a1da97eb0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 11:55:18 +0200 Subject: x86/fpu: Eliminate the __thread_has_fpu() wrapper Start migrating FPU methods towards using 'struct fpu *fpu' directly. __thread_has_fpu() is just a trivial wrapper around fpu->has_fpu, eliminate it. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 16 ++++------------ arch/x86/kernel/fpu/core.c | 17 ++++++++++------- 2 files changed, 14 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index e180fb96dd0d..c005d1fc1247 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -323,16 +323,6 @@ static inline int restore_fpu_checking(struct task_struct *tsk) return fpu_restore_checking(&tsk->thread.fpu); } -/* - * Software FPU state helpers. Careful: these need to - * be preemption protection *and* they need to be - * properly paired with the CR0.TS changes! - */ -static inline int __thread_has_fpu(struct task_struct *tsk) -{ - return tsk->thread.fpu.has_fpu; -} - /* Must be paired with an 'stts' after! */ static inline void __thread_clear_has_fpu(struct task_struct *tsk) { @@ -370,13 +360,14 @@ static inline void __thread_fpu_begin(struct task_struct *tsk) static inline void drop_fpu(struct task_struct *tsk) { + struct fpu *fpu = &tsk->thread.fpu; /* * Forget coprocessor state.. */ preempt_disable(); tsk->thread.fpu.counter = 0; - if (__thread_has_fpu(tsk)) { + if (fpu->has_fpu) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" @@ -424,6 +415,7 @@ typedef struct { int preload; } fpu_switch_t; static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) { + struct fpu *old_fpu = &old->thread.fpu; fpu_switch_t fpu; /* @@ -433,7 +425,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta fpu.preload = tsk_used_math(new) && (use_eager_fpu() || new->thread.fpu.counter > 5); - if (__thread_has_fpu(old)) { + if (old_fpu->has_fpu) { if (!fpu_save_init(&old->thread.fpu)) task_disable_lazy_fpu_restore(old); else diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3aeab3f12835..29b837730a07 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -57,8 +57,7 @@ static bool interrupted_kernel_fpu_idle(void) if (use_eager_fpu()) return true; - return !__thread_has_fpu(current) && - (read_cr0() & X86_CR0_TS); + return !current->thread.fpu.has_fpu && (read_cr0() & X86_CR0_TS); } /* @@ -93,11 +92,12 @@ EXPORT_SYMBOL(irq_fpu_usable); void __kernel_fpu_begin(void) { struct task_struct *me = current; + struct fpu *fpu = &me->thread.fpu; kernel_fpu_disable(); - if (__thread_has_fpu(me)) { - fpu_save_init(&me->thread.fpu); + if (fpu->has_fpu) { + fpu_save_init(fpu); } else { this_cpu_write(fpu_owner_task, NULL); if (!use_eager_fpu()) @@ -109,8 +109,9 @@ EXPORT_SYMBOL(__kernel_fpu_begin); void __kernel_fpu_end(void) { struct task_struct *me = current; + struct fpu *fpu = &me->thread.fpu; - if (__thread_has_fpu(me)) { + if (fpu->has_fpu) { if (WARN_ON(restore_fpu_checking(me))) fpu_reset_state(me); } else if (!use_eager_fpu()) { @@ -128,14 +129,16 @@ EXPORT_SYMBOL(__kernel_fpu_end); */ void fpu__save(struct task_struct *tsk) { + struct fpu *fpu = &tsk->thread.fpu; + WARN_ON(tsk != current); preempt_disable(); - if (__thread_has_fpu(tsk)) { + if (fpu->has_fpu) { if (use_eager_fpu()) { __save_fpu(tsk); } else { - fpu_save_init(&tsk->thread.fpu); + fpu_save_init(fpu); __thread_fpu_end(tsk); } } -- cgit v1.2.1 From 36fe6175be15d33fec7c6aa53e6e202ad44f0b25 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 12:08:58 +0200 Subject: x86/fpu: Change __thread_clear_has_fpu() to 'struct fpu' parameter We do this to make the code more readable, and also to be able to eliminate task_struct usage from most of the FPU code. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index c005d1fc1247..94c068b6238e 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -324,9 +324,9 @@ static inline int restore_fpu_checking(struct task_struct *tsk) } /* Must be paired with an 'stts' after! */ -static inline void __thread_clear_has_fpu(struct task_struct *tsk) +static inline void __thread_clear_has_fpu(struct fpu *fpu) { - tsk->thread.fpu.has_fpu = 0; + fpu->has_fpu = 0; this_cpu_write(fpu_owner_task, NULL); } @@ -346,7 +346,7 @@ static inline void __thread_set_has_fpu(struct task_struct *tsk) */ static inline void __thread_fpu_end(struct task_struct *tsk) { - __thread_clear_has_fpu(tsk); + __thread_clear_has_fpu(&tsk->thread.fpu); if (!use_eager_fpu()) stts(); } -- cgit v1.2.1 From b0c050c5ba130c0ccb1b86b64f162a4601d160c7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 12:13:04 +0200 Subject: x86/fpu: Move 'PER_CPU(fpu_owner_task)' to fpu/core.c Move it closer to other per-cpu FPU data structures. This also unifies the 32-bit and 64-bit code. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 3 --- arch/x86/kernel/fpu/core.c | 5 +++++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 220ad95e0e28..88bb7a75f5c6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1182,8 +1182,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); - /* * Special IST stacks which the CPU switches to when it calls * an IST-marked descriptor entry. Up to 7 stacks (hardware @@ -1274,7 +1272,6 @@ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); /* * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 29b837730a07..d29fec70e6b3 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -20,6 +20,11 @@ */ static DEFINE_PER_CPU(bool, in_kernel_fpu); +/* + * Track which task is using the FPU on the CPU: + */ +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); + static void kernel_fpu_disable(void) { WARN_ON(this_cpu_read(in_kernel_fpu)); -- cgit v1.2.1 From 36b544dcd3f935bd33ada700d070433a57982771 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 12:18:28 +0200 Subject: x86/fpu: Change fpu_owner_task to fpu_fpregs_owner_ctx Track the FPU owner context instead of the owner task: this change, together with other changes, will allow in subsequent patches the elimination of 'struct task_struct' usage in various FPU code: we'll be able to use 'struct fpu' only. There's no change in code size: text data bss dec hex filename 13066467 2545248 1626112 17237827 1070743 vmlinux.before 13066467 2545248 1626112 17237827 1070743 vmlinux.after Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 14 +++++++------- arch/x86/kernel/fpu/core.c | 9 ++++----- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 94c068b6238e..d0fe7bbb51d1 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -37,7 +37,7 @@ extern unsigned int mxcsr_feature_mask; extern void fpu__cpu_init(void); extern void eager_fpu_init(void); -DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); +DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); extern void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk); @@ -63,7 +63,7 @@ static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} #endif /* - * Must be run with preemption disabled: this clears the fpu_owner_task, + * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, * on this CPU. * * This will disable any lazy FPU state restore of the current FPU state, @@ -71,7 +71,7 @@ static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} */ static inline void __cpu_disable_lazy_restore(unsigned int cpu) { - per_cpu(fpu_owner_task, cpu) = NULL; + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; } /* @@ -86,7 +86,7 @@ static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk) static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) { - return new == this_cpu_read_stable(fpu_owner_task) && + return &new->thread.fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == new->thread.fpu.last_cpu; } @@ -327,14 +327,14 @@ static inline int restore_fpu_checking(struct task_struct *tsk) static inline void __thread_clear_has_fpu(struct fpu *fpu) { fpu->has_fpu = 0; - this_cpu_write(fpu_owner_task, NULL); + this_cpu_write(fpu_fpregs_owner_ctx, NULL); } /* Must be paired with a 'clts' before! */ static inline void __thread_set_has_fpu(struct task_struct *tsk) { tsk->thread.fpu.has_fpu = 1; - this_cpu_write(fpu_owner_task, tsk); + this_cpu_write(fpu_fpregs_owner_ctx, &tsk->thread.fpu); } /* @@ -431,7 +431,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta else old->thread.fpu.last_cpu = cpu; - /* But leave fpu_owner_task! */ + /* But leave fpu_fpregs_owner_ctx! */ old->thread.fpu.has_fpu = 0; /* Don't change CR0.TS if we just switch! */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d29fec70e6b3..ac390c690944 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -21,9 +21,9 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* - * Track which task is using the FPU on the CPU: + * Track which context is using the FPU on the CPU: */ -DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); static void kernel_fpu_disable(void) { @@ -96,15 +96,14 @@ EXPORT_SYMBOL(irq_fpu_usable); void __kernel_fpu_begin(void) { - struct task_struct *me = current; - struct fpu *fpu = &me->thread.fpu; + struct fpu *fpu = ¤t->thread.fpu; kernel_fpu_disable(); if (fpu->has_fpu) { fpu_save_init(fpu); } else { - this_cpu_write(fpu_owner_task, NULL); + this_cpu_write(fpu_fpregs_owner_ctx, NULL); if (!use_eager_fpu()) clts(); } -- cgit v1.2.1 From c0311f63e3cc8b29b6006ac8c1d4e3f6fbb2e357 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 12:24:59 +0200 Subject: x86/fpu: Remove 'struct task_struct' usage from __thread_set_has_fpu() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index d0fe7bbb51d1..cf0d4124fb3d 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -331,10 +331,10 @@ static inline void __thread_clear_has_fpu(struct fpu *fpu) } /* Must be paired with a 'clts' before! */ -static inline void __thread_set_has_fpu(struct task_struct *tsk) +static inline void __thread_set_has_fpu(struct fpu *fpu) { - tsk->thread.fpu.has_fpu = 1; - this_cpu_write(fpu_fpregs_owner_ctx, &tsk->thread.fpu); + fpu->has_fpu = 1; + this_cpu_write(fpu_fpregs_owner_ctx, fpu); } /* @@ -355,7 +355,7 @@ static inline void __thread_fpu_begin(struct task_struct *tsk) { if (!use_eager_fpu()) clts(); - __thread_set_has_fpu(tsk); + __thread_set_has_fpu(&tsk->thread.fpu); } static inline void drop_fpu(struct task_struct *tsk) @@ -416,6 +416,7 @@ typedef struct { int preload; } fpu_switch_t; static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) { struct fpu *old_fpu = &old->thread.fpu; + struct fpu *new_fpu = &new->thread.fpu; fpu_switch_t fpu; /* @@ -437,7 +438,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { new->thread.fpu.counter++; - __thread_set_has_fpu(new); + __thread_set_has_fpu(new_fpu); prefetch(new->thread.fpu.state); } else if (!use_eager_fpu()) stts(); -- cgit v1.2.1 From 35191e3f073c442b201f8beb5315561271d2327a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 12:26:55 +0200 Subject: x86/fpu: Remove 'struct task_struct' usage from __thread_fpu_end() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 6 +++--- arch/x86/kernel/fpu/core.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index cf0d4124fb3d..b1803a656651 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -344,9 +344,9 @@ static inline void __thread_set_has_fpu(struct fpu *fpu) * These generally need preemption protection to work, * do try to avoid using these on their own. */ -static inline void __thread_fpu_end(struct task_struct *tsk) +static inline void __thread_fpu_end(struct fpu *fpu) { - __thread_clear_has_fpu(&tsk->thread.fpu); + __thread_clear_has_fpu(fpu); if (!use_eager_fpu()) stts(); } @@ -372,7 +372,7 @@ static inline void drop_fpu(struct task_struct *tsk) asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(tsk); + __thread_fpu_end(fpu); } clear_stopped_child_used_math(tsk); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ac390c690944..4e1f8f1bf493 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -143,7 +143,7 @@ void fpu__save(struct task_struct *tsk) __save_fpu(tsk); } else { fpu_save_init(fpu); - __thread_fpu_end(tsk); + __thread_fpu_end(fpu); } } preempt_enable(); -- cgit v1.2.1 From 4540d3faa7c3fca6a6125448861de0e2e485658b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 12:31:17 +0200 Subject: x86/fpu: Remove 'struct task_struct' usage from __thread_fpu_begin() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 10 ++++++---- arch/x86/kernel/fpu/core.c | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index b1803a656651..44516ad6c890 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -351,11 +351,11 @@ static inline void __thread_fpu_end(struct fpu *fpu) stts(); } -static inline void __thread_fpu_begin(struct task_struct *tsk) +static inline void __thread_fpu_begin(struct fpu *fpu) { if (!use_eager_fpu()) clts(); - __thread_set_has_fpu(&tsk->thread.fpu); + __thread_set_has_fpu(fpu); } static inline void drop_fpu(struct task_struct *tsk) @@ -451,7 +451,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta fpu.preload = 0; else prefetch(new->thread.fpu.state); - __thread_fpu_begin(new); + __thread_fpu_begin(new_fpu); } } return fpu; @@ -505,9 +505,11 @@ static inline int restore_xstate_sig(void __user *buf, int ia32_frame) */ static inline void user_fpu_begin(void) { + struct fpu *fpu = ¤t->thread.fpu; + preempt_disable(); if (!user_has_fpu()) - __thread_fpu_begin(current); + __thread_fpu_begin(fpu); preempt_enable(); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 4e1f8f1bf493..cf49cd574d32 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -329,6 +329,7 @@ static int fpu__unlazy_stopped(struct task_struct *child) void fpu__restore(void) { struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; if (!tsk_used_math(tsk)) { local_irq_enable(); @@ -347,7 +348,7 @@ void fpu__restore(void) /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */ kernel_fpu_disable(); - __thread_fpu_begin(tsk); + __thread_fpu_begin(fpu); if (unlikely(restore_fpu_checking(tsk))) { fpu_reset_state(tsk); force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); -- cgit v1.2.1 From 4c1384100ebf51651d02430a7f70661ef1ef06ac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 12:46:20 +0200 Subject: x86/fpu: Open code PF_USED_MATH usages PF_USED_MATH is used directly, but also in a handful of helper inlines. To ease the elimination of PF_USED_MATH, convert all inline helpers to open-coded PF_USED_MATH usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu-internal.h | 5 +++-- arch/x86/kernel/fpu/core.c | 14 ++++++++------ arch/x86/kernel/fpu/xsave.c | 10 +++++----- arch/x86/kernel/signal.c | 6 +++--- arch/x86/kvm/x86.c | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- 7 files changed, 22 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 4bafd5b05aca..bffb2c49ceb6 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -321,7 +321,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; - if (used_math()) { + if (current->flags & PF_USED_MATH) { unsigned long fx_aligned, math_size; sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size); diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 44516ad6c890..2cac49e3b4bd 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -375,7 +375,8 @@ static inline void drop_fpu(struct task_struct *tsk) __thread_fpu_end(fpu); } - clear_stopped_child_used_math(tsk); + tsk->flags &= ~PF_USED_MATH; + preempt_enable(); } @@ -423,7 +424,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta * If the task has used the math, pre-load the FPU on xsave processors * or if the past 5 consecutive context-switches used math. */ - fpu.preload = tsk_used_math(new) && + fpu.preload = (new->flags & PF_USED_MATH) && (use_eager_fpu() || new->thread.fpu.counter > 5); if (old_fpu->has_fpu) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index cf49cd574d32..90f624d68b26 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -242,7 +242,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src) task_disable_lazy_fpu_restore(dst); - if (tsk_used_math(src)) { + if (src->flags & PF_USED_MATH) { int err = fpstate_alloc(&dst->thread.fpu); if (err) @@ -331,7 +331,7 @@ void fpu__restore(void) struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - if (!tsk_used_math(tsk)) { + if (!(tsk->flags & PF_USED_MATH)) { local_irq_enable(); /* * does a slab alloc which can sleep @@ -361,12 +361,14 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__flush_thread(struct task_struct *tsk) { + WARN_ON(tsk != current); + if (!use_eager_fpu()) { /* FPU state will be reallocated lazily at the first use. */ drop_fpu(tsk); fpstate_free(&tsk->thread.fpu); } else { - if (!tsk_used_math(tsk)) { + if (!(tsk->flags & PF_USED_MATH)) { /* kthread execs. TODO: cleanup this horror. */ if (WARN_ON(fpstate_alloc_init(tsk))) force_sig(SIGKILL, tsk); @@ -383,12 +385,12 @@ void fpu__flush_thread(struct task_struct *tsk) */ int fpregs_active(struct task_struct *target, const struct user_regset *regset) { - return tsk_used_math(target) ? regset->n : 0; + return (target->flags & PF_USED_MATH) ? regset->n : 0; } int xfpregs_active(struct task_struct *target, const struct user_regset *regset) { - return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0; + return (cpu_has_fxsr && (target->flags & PF_USED_MATH)) ? regset->n : 0; } int xfpregs_get(struct task_struct *target, const struct user_regset *regset, @@ -719,7 +721,7 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) struct task_struct *tsk = current; int fpvalid; - fpvalid = !!used_math(); + fpvalid = !!(tsk->flags & PF_USED_MATH); if (fpvalid) fpvalid = !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 61696c5005eb..8cd127049c9b 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -349,7 +349,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_READ, buf, size)) return -EACCES; - if (!used_math() && fpstate_alloc_init(tsk)) + if (!(tsk->flags & PF_USED_MATH) && fpstate_alloc_init(tsk)) return -1; if (!static_cpu_has(X86_FEATURE_FPU)) @@ -384,12 +384,12 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) int err = 0; /* - * Drop the current fpu which clears used_math(). This ensures + * Drop the current fpu which clears PF_USED_MATH. This ensures * that any context-switch during the copy of the new state, * avoids the intermediate state from getting restored/saved. * Thus avoiding the new restored state from getting corrupted. * We will be ready to restore/save the state only after - * set_used_math() is again set. + * PF_USED_MATH is again set. */ drop_fpu(tsk); @@ -401,7 +401,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); } - set_used_math(); + tsk->flags |= PF_USED_MATH; if (use_eager_fpu()) { preempt_disable(); fpu__restore(); @@ -685,7 +685,7 @@ void xsave_init(void) */ void __init_refok eager_fpu_init(void) { - WARN_ON(used_math()); + WARN_ON(current->flags & PF_USED_MATH); current_thread_info()->status = 0; if (eagerfpu == ENABLE) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 35f867aa597e..8e2529ebb8c6 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -217,7 +217,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } } - if (used_math()) { + if (current->flags & PF_USED_MATH) { sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32), &buf_fx, &math_size); *fpstate = (void __user *)sp; @@ -233,7 +233,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (used_math() && + if ((current->flags & PF_USED_MATH) && save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; @@ -664,7 +664,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - if (used_math()) + if (current->flags & PF_USED_MATH) fpu_reset_state(current); } signal_setup_done(failed, ksig, stepping); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be276e0fe0ff..0635a1fd43ba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6600,7 +6600,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - if (!tsk_used_math(current) && fpstate_alloc_init(current)) + if (!(current->flags & PF_USED_MATH) && fpstate_alloc_init(current)) return -ENOMEM; if (vcpu->sigset_active) diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index c9ff09a02385..bf628804d67c 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -148,7 +148,7 @@ void math_emulate(struct math_emu_info *info) unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ struct desc_struct code_descriptor; - if (!used_math()) { + if (!(current->flags & PF_USED_MATH)) { if (fpstate_alloc_init(current)) { do_group_exit(SIGKILL); return; -- cgit v1.2.1 From af7f8721f1f1252473b154c38dd7583abfe3206b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 14:06:05 +0200 Subject: x86/fpu: Document fpu__unlazy_stopped() Explain its usage and also document a TODO item. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 90f624d68b26..779813126f49 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -284,10 +284,27 @@ int fpstate_alloc_init(struct task_struct *curr) EXPORT_SYMBOL_GPL(fpstate_alloc_init); /* - * The _current_ task is using the FPU for the first time - * so initialize it and set the mxcsr to its default - * value at reset if we support XMM instructions and then - * remember the current task has used the FPU. + * This function is called before we modify a stopped child's + * FPU state context. + * + * If the child has not used the FPU before then initialize its + * FPU context. + * + * If the child has used the FPU before then unlazy it. + * + * [ After this function call, after the context is modified and + * the child task is woken up, the child task will restore + * the modified FPU state from the modified context. If we + * didn't clear its lazy status here then the lazy in-registers + * state pending on its former CPU could be restored, losing + * the modifications. ] + * + * This function is also called before we read a stopped child's + * FPU state - to make sure it's modified. + * + * TODO: A future optimization would be to skip the unlazying in + * the read-only case, it's not strictly necessary for + * read-only access to the context. */ static int fpu__unlazy_stopped(struct task_struct *child) { -- cgit v1.2.1 From c5bedc6847c3be6efe0e671a6155c9a25fd468bf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 12:49:20 +0200 Subject: x86/fpu: Get rid of PF_USED_MATH usage, convert it to fpu->fpstate_active Introduce a simple fpu->fpstate_active flag in the fpu context data structure and use that instead of PF_USED_MATH in task->flags. Testing for this flag byte should be slightly more efficient than testing a bit in a bitmask, but the main advantage is that most FPU functions can now be performed on a 'struct fpu' alone, they don't need access to 'struct task_struct' anymore. There's a slight linecount increase, mostly due to the 'fpu' local variables and due to extra comments. The local variables will go away once we move most of the FPU methods to pure 'struct fpu' parameters. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 3 ++- arch/x86/include/asm/fpu-internal.h | 4 ++-- arch/x86/include/asm/fpu/types.h | 6 ++++++ arch/x86/include/asm/processor.h | 6 ++++-- arch/x86/kernel/fpu/core.c | 38 ++++++++++++++++++++++++------------- arch/x86/kernel/fpu/xsave.c | 11 ++++++----- arch/x86/kernel/signal.c | 8 +++++--- arch/x86/kvm/x86.c | 3 ++- arch/x86/math-emu/fpu_entry.c | 3 ++- 9 files changed, 54 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index bffb2c49ceb6..e1ec6f90d09e 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -307,6 +307,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { + struct fpu *fpu = ¤t->thread.fpu; unsigned long sp; /* Default to using normal stack */ @@ -321,7 +322,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; - if (current->flags & PF_USED_MATH) { + if (fpu->fpstate_active) { unsigned long fx_aligned, math_size; sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size); diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 2cac49e3b4bd..9311126571ab 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -375,7 +375,7 @@ static inline void drop_fpu(struct task_struct *tsk) __thread_fpu_end(fpu); } - tsk->flags &= ~PF_USED_MATH; + fpu->fpstate_active = 0; preempt_enable(); } @@ -424,7 +424,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta * If the task has used the math, pre-load the FPU on xsave processors * or if the past 5 consecutive context-switches used math. */ - fpu.preload = (new->flags & PF_USED_MATH) && + fpu.preload = new_fpu->fpstate_active && (use_eager_fpu() || new->thread.fpu.counter > 5); if (old_fpu->has_fpu) { diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index efb520dcf38e..f6317d9aa808 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -137,6 +137,12 @@ struct fpu { * deal with bursty apps that only use the FPU for a short time: */ unsigned char counter; + /* + * This flag indicates whether this context is fpstate_active: if the task is + * not running then we can restore from this context, if the task + * is running then we should save into this context. + */ + unsigned char fpstate_active; }; #endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d50cc7f61559..0f4add462697 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -385,6 +385,10 @@ struct thread_struct { unsigned long fs; #endif unsigned long gs; + + /* Floating point and extended processor state */ + struct fpu fpu; + /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ @@ -395,8 +399,6 @@ struct thread_struct { unsigned long cr2; unsigned long trap_nr; unsigned long error_code; - /* floating point and extended processor state */ - struct fpu fpu; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ struct vm86_struct __user *vm86_info; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 779813126f49..9e7f9e7b2cca 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -236,14 +236,17 @@ static void fpu_copy(struct task_struct *dst, struct task_struct *src) int fpu__copy(struct task_struct *dst, struct task_struct *src) { + struct fpu *dst_fpu = &dst->thread.fpu; + struct fpu *src_fpu = &src->thread.fpu; + dst->thread.fpu.counter = 0; dst->thread.fpu.has_fpu = 0; dst->thread.fpu.state = NULL; task_disable_lazy_fpu_restore(dst); - if (src->flags & PF_USED_MATH) { - int err = fpstate_alloc(&dst->thread.fpu); + if (src_fpu->fpstate_active) { + int err = fpstate_alloc(dst_fpu); if (err) return err; @@ -260,11 +263,12 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src) */ int fpstate_alloc_init(struct task_struct *curr) { + struct fpu *fpu = &curr->thread.fpu; int ret; if (WARN_ON_ONCE(curr != current)) return -EINVAL; - if (WARN_ON_ONCE(curr->flags & PF_USED_MATH)) + if (WARN_ON_ONCE(fpu->fpstate_active)) return -EINVAL; /* @@ -277,7 +281,7 @@ int fpstate_alloc_init(struct task_struct *curr) fpstate_init(&curr->thread.fpu); /* Safe to do for the current task: */ - curr->flags |= PF_USED_MATH; + fpu->fpstate_active = 1; return 0; } @@ -308,12 +312,13 @@ EXPORT_SYMBOL_GPL(fpstate_alloc_init); */ static int fpu__unlazy_stopped(struct task_struct *child) { + struct fpu *child_fpu = &child->thread.fpu; int ret; if (WARN_ON_ONCE(child == current)) return -EINVAL; - if (child->flags & PF_USED_MATH) { + if (child_fpu->fpstate_active) { task_disable_lazy_fpu_restore(child); return 0; } @@ -328,7 +333,7 @@ static int fpu__unlazy_stopped(struct task_struct *child) fpstate_init(&child->thread.fpu); /* Safe to do for stopped child tasks: */ - child->flags |= PF_USED_MATH; + child_fpu->fpstate_active = 1; return 0; } @@ -348,7 +353,7 @@ void fpu__restore(void) struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - if (!(tsk->flags & PF_USED_MATH)) { + if (!fpu->fpstate_active) { local_irq_enable(); /* * does a slab alloc which can sleep @@ -378,6 +383,8 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__flush_thread(struct task_struct *tsk) { + struct fpu *fpu = &tsk->thread.fpu; + WARN_ON(tsk != current); if (!use_eager_fpu()) { @@ -385,7 +392,7 @@ void fpu__flush_thread(struct task_struct *tsk) drop_fpu(tsk); fpstate_free(&tsk->thread.fpu); } else { - if (!(tsk->flags & PF_USED_MATH)) { + if (!fpu->fpstate_active) { /* kthread execs. TODO: cleanup this horror. */ if (WARN_ON(fpstate_alloc_init(tsk))) force_sig(SIGKILL, tsk); @@ -402,12 +409,16 @@ void fpu__flush_thread(struct task_struct *tsk) */ int fpregs_active(struct task_struct *target, const struct user_regset *regset) { - return (target->flags & PF_USED_MATH) ? regset->n : 0; + struct fpu *target_fpu = &target->thread.fpu; + + return target_fpu->fpstate_active ? regset->n : 0; } int xfpregs_active(struct task_struct *target, const struct user_regset *regset) { - return (cpu_has_fxsr && (target->flags & PF_USED_MATH)) ? regset->n : 0; + struct fpu *target_fpu = &target->thread.fpu; + + return (cpu_has_fxsr && target_fpu->fpstate_active) ? regset->n : 0; } int xfpregs_get(struct task_struct *target, const struct user_regset *regset, @@ -733,16 +744,17 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * struct user_i387_struct) but is in fact only used for 32-bit * dumps, so on 64-bit it is really struct user_i387_ia32_struct. */ -int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) +int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) { struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; int fpvalid; - fpvalid = !!(tsk->flags & PF_USED_MATH); + fpvalid = fpu->fpstate_active; if (fpvalid) fpvalid = !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), - fpu, NULL); + ufpu, NULL); return fpvalid; } diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 8cd127049c9b..dc346e19c0df 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -334,6 +334,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) { int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; int state_size = xstate_size; u64 xstate_bv = 0; int fx_only = 0; @@ -349,7 +350,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_READ, buf, size)) return -EACCES; - if (!(tsk->flags & PF_USED_MATH) && fpstate_alloc_init(tsk)) + if (!fpu->fpstate_active && fpstate_alloc_init(tsk)) return -1; if (!static_cpu_has(X86_FEATURE_FPU)) @@ -384,12 +385,12 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) int err = 0; /* - * Drop the current fpu which clears PF_USED_MATH. This ensures + * Drop the current fpu which clears fpu->fpstate_active. This ensures * that any context-switch during the copy of the new state, * avoids the intermediate state from getting restored/saved. * Thus avoiding the new restored state from getting corrupted. * We will be ready to restore/save the state only after - * PF_USED_MATH is again set. + * fpu->fpstate_active is again set. */ drop_fpu(tsk); @@ -401,7 +402,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); } - tsk->flags |= PF_USED_MATH; + fpu->fpstate_active = 1; if (use_eager_fpu()) { preempt_disable(); fpu__restore(); @@ -685,7 +686,7 @@ void xsave_init(void) */ void __init_refok eager_fpu_init(void) { - WARN_ON(current->flags & PF_USED_MATH); + WARN_ON(current->thread.fpu.fpstate_active); current_thread_info()->status = 0; if (eagerfpu == ENABLE) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 8e2529ebb8c6..20a9d355af59 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -198,6 +198,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long sp = regs->sp; unsigned long buf_fx = 0; int onsigstack = on_sig_stack(sp); + struct fpu *fpu = ¤t->thread.fpu; /* redzone */ if (config_enabled(CONFIG_X86_64)) @@ -217,7 +218,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } } - if (current->flags & PF_USED_MATH) { + if (fpu->fpstate_active) { sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32), &buf_fx, &math_size); *fpstate = (void __user *)sp; @@ -233,7 +234,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if ((current->flags & PF_USED_MATH) && + if (fpu->fpstate_active && save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; @@ -616,6 +617,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { bool stepping, failed; + struct fpu *fpu = ¤t->thread.fpu; /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { @@ -664,7 +666,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - if (current->flags & PF_USED_MATH) + if (fpu->fpstate_active) fpu_reset_state(current); } signal_setup_done(failed, ksig, stepping); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0635a1fd43ba..bab8afb61dc1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6597,10 +6597,11 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct fpu *fpu = ¤t->thread.fpu; int r; sigset_t sigsaved; - if (!(current->flags & PF_USED_MATH) && fpstate_alloc_init(current)) + if (!fpu->fpstate_active && fpstate_alloc_init(current)) return -ENOMEM; if (vcpu->sigset_active) diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index bf628804d67c..f1aac55d6a67 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -147,8 +147,9 @@ void math_emulate(struct math_emu_info *info) unsigned long code_base = 0; unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ struct desc_struct code_descriptor; + struct fpu *fpu = ¤t->thread.fpu; - if (!(current->flags & PF_USED_MATH)) { + if (!fpu->fpstate_active) { if (fpstate_alloc_init(current)) { do_group_exit(SIGKILL); return; -- cgit v1.2.1 From ca6787ba0fcc875cfb06dc2a538ac23210b7d251 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 12:33:50 +0200 Subject: x86/fpu: Remove 'struct task_struct' usage from drop_fpu() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 9 +++++---- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/xsave.c | 2 +- arch/x86/kernel/process.c | 3 ++- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 9311126571ab..e8f7134f0ffb 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -358,14 +358,13 @@ static inline void __thread_fpu_begin(struct fpu *fpu) __thread_set_has_fpu(fpu); } -static inline void drop_fpu(struct task_struct *tsk) +static inline void drop_fpu(struct fpu *fpu) { - struct fpu *fpu = &tsk->thread.fpu; /* * Forget coprocessor state.. */ preempt_disable(); - tsk->thread.fpu.counter = 0; + fpu->counter = 0; if (fpu->has_fpu) { /* Ignore delayed exceptions from user space */ @@ -394,8 +393,10 @@ static inline void restore_init_xstate(void) */ static inline void fpu_reset_state(struct task_struct *tsk) { + struct fpu *fpu = &tsk->thread.fpu; + if (!use_eager_fpu()) - drop_fpu(tsk); + drop_fpu(fpu); else restore_init_xstate(); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9e7f9e7b2cca..ba539fc018d7 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -389,7 +389,7 @@ void fpu__flush_thread(struct task_struct *tsk) if (!use_eager_fpu()) { /* FPU state will be reallocated lazily at the first use. */ - drop_fpu(tsk); + drop_fpu(fpu); fpstate_free(&tsk->thread.fpu); } else { if (!fpu->fpstate_active) { diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index dc346e19c0df..049dc619481d 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -392,7 +392,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) * We will be ready to restore/save the state only after * fpu->fpstate_active is again set. */ - drop_fpu(tsk); + drop_fpu(fpu); if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) || __copy_from_user(&env, buf, sizeof(env))) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bb7d4abcdad6..50d503a2d8c3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -104,6 +104,7 @@ void exit_thread(void) struct task_struct *me = current; struct thread_struct *t = &me->thread; unsigned long *bp = t->io_bitmap_ptr; + struct fpu *fpu = &t->fpu; if (bp) { struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); @@ -119,7 +120,7 @@ void exit_thread(void) kfree(bp); } - drop_fpu(me); + drop_fpu(fpu); } void flush_thread(void) -- cgit v1.2.1 From eb6a3251bfe34f327570993e9a95dbf3a592b912 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 17:08:41 +0200 Subject: x86/fpu: Remove task_disable_lazy_fpu_restore() Replace task_disable_lazy_fpu_restore() with easier to read open-coded uses: we already update the fpu->last_cpu field explicitly in other cases. (This also removes yet another task_struct using FPU method.) Better explain the fpu::last_cpu field in the structure definition. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 14 ++------------ arch/x86/include/asm/fpu/types.h | 11 +++++++++++ arch/x86/kernel/fpu/core.c | 5 ++--- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index e8f7134f0ffb..76a1f3529881 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -74,16 +74,6 @@ static inline void __cpu_disable_lazy_restore(unsigned int cpu) per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; } -/* - * Used to indicate that the FPU state in memory is newer than the FPU - * state in registers, and the FPU state should be reloaded next time the - * task is run. Only safe on the current task, or non-running tasks. - */ -static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk) -{ - tsk->thread.fpu.last_cpu = ~0; -} - static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) { return &new->thread.fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && @@ -430,7 +420,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta if (old_fpu->has_fpu) { if (!fpu_save_init(&old->thread.fpu)) - task_disable_lazy_fpu_restore(old); + old->thread.fpu.last_cpu = -1; else old->thread.fpu.last_cpu = cpu; @@ -446,7 +436,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta stts(); } else { old->thread.fpu.counter = 0; - task_disable_lazy_fpu_restore(old); + old->thread.fpu.last_cpu = -1; if (fpu.preload) { new->thread.fpu.counter++; if (fpu_lazy_restore(new, cpu)) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index f6317d9aa808..cad1c37d9ea2 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -125,7 +125,18 @@ union thread_xstate { }; struct fpu { + /* + * Records the last CPU on which this context was loaded into + * FPU registers. (In the lazy-switching case we might be + * able to reuse FPU registers across multiple context switches + * this way, if no intermediate task used the FPU.) + * + * A value of -1 is used to indicate that the FPU state in context + * memory is newer than the FPU state in registers, and that the + * FPU state should be reloaded next time the task is run. + */ unsigned int last_cpu; + unsigned int has_fpu; union thread_xstate *state; /* diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ba539fc018d7..230e93783c99 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -242,8 +242,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src) dst->thread.fpu.counter = 0; dst->thread.fpu.has_fpu = 0; dst->thread.fpu.state = NULL; - - task_disable_lazy_fpu_restore(dst); + dst->thread.fpu.last_cpu = -1; if (src_fpu->fpstate_active) { int err = fpstate_alloc(dst_fpu); @@ -319,7 +318,7 @@ static int fpu__unlazy_stopped(struct task_struct *child) return -EINVAL; if (child_fpu->fpstate_active) { - task_disable_lazy_fpu_restore(child); + child->thread.fpu.last_cpu = -1; return 0; } -- cgit v1.2.1 From 66ddc2cb0f598818b39b66867007634244322843 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 17:25:44 +0200 Subject: x86/fpu: Use 'struct fpu' in fpu_lazy_restore() Also rename it to fpu_want_lazy_restore(), to better indicate that this function just tests whether we can do a lazy restore. (The old name suggested that it was doing the lazy restore, which is not the case.) Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 76a1f3529881..3f6d36c6ffce 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -74,10 +74,9 @@ static inline void __cpu_disable_lazy_restore(unsigned int cpu) per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; } -static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) { - return &new->thread.fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && - cpu == new->thread.fpu.last_cpu; + return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } static inline int is_ia32_compat_frame(void) @@ -439,7 +438,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta old->thread.fpu.last_cpu = -1; if (fpu.preload) { new->thread.fpu.counter++; - if (fpu_lazy_restore(new, cpu)) + if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); -- cgit v1.2.1 From 11f2d50b10289f49676ec07bf3fef932473ef6d5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 17:30:59 +0200 Subject: x86/fpu: Use 'struct fpu' in restore_fpu_checking() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 10 ++++++---- arch/x86/kernel/fpu/core.c | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 3f6d36c6ffce..2d7934e4e394 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -294,7 +294,7 @@ static inline int fpu_restore_checking(struct fpu *fpu) return frstor_checking(&fpu->state->fsave); } -static inline int restore_fpu_checking(struct task_struct *tsk) +static inline int restore_fpu_checking(struct fpu *fpu) { /* * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is @@ -306,10 +306,10 @@ static inline int restore_fpu_checking(struct task_struct *tsk) "fnclex\n\t" "emms\n\t" "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (tsk->thread.fpu.has_fpu)); + : : [addr] "m" (fpu->has_fpu)); } - return fpu_restore_checking(&tsk->thread.fpu); + return fpu_restore_checking(fpu); } /* Must be paired with an 'stts' after! */ @@ -456,8 +456,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta */ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { + struct fpu *new_fpu = &new->thread.fpu; + if (fpu.preload) { - if (unlikely(restore_fpu_checking(new))) + if (unlikely(restore_fpu_checking(new_fpu))) fpu_reset_state(new); } } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 230e93783c99..1ecd25028079 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -116,7 +116,7 @@ void __kernel_fpu_end(void) struct fpu *fpu = &me->thread.fpu; if (fpu->has_fpu) { - if (WARN_ON(restore_fpu_checking(me))) + if (WARN_ON(restore_fpu_checking(fpu))) fpu_reset_state(me); } else if (!use_eager_fpu()) { stts(); @@ -370,7 +370,7 @@ void fpu__restore(void) /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */ kernel_fpu_disable(); __thread_fpu_begin(fpu); - if (unlikely(restore_fpu_checking(tsk))) { + if (unlikely(restore_fpu_checking(fpu))) { fpu_reset_state(tsk); force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); } else { -- cgit v1.2.1 From af2d94fddcf41e879908b35a8a5308fb94e989c5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 17:34:20 +0200 Subject: x86/fpu: Use 'struct fpu' in fpu_reset_state() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 6 ++---- arch/x86/kernel/fpu/core.c | 7 +++---- arch/x86/kernel/fpu/xsave.c | 4 ++-- arch/x86/kernel/signal.c | 2 +- 4 files changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 2d7934e4e394..579f7d0a399d 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -380,10 +380,8 @@ static inline void restore_init_xstate(void) * Reset the FPU state in the eager case and drop it in the lazy case (later use * will reinit it). */ -static inline void fpu_reset_state(struct task_struct *tsk) +static inline void fpu_reset_state(struct fpu *fpu) { - struct fpu *fpu = &tsk->thread.fpu; - if (!use_eager_fpu()) drop_fpu(fpu); else @@ -460,7 +458,7 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) if (fpu.preload) { if (unlikely(restore_fpu_checking(new_fpu))) - fpu_reset_state(new); + fpu_reset_state(new_fpu); } } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1ecd25028079..41c92897f574 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -112,12 +112,11 @@ EXPORT_SYMBOL(__kernel_fpu_begin); void __kernel_fpu_end(void) { - struct task_struct *me = current; - struct fpu *fpu = &me->thread.fpu; + struct fpu *fpu = ¤t->thread.fpu; if (fpu->has_fpu) { if (WARN_ON(restore_fpu_checking(fpu))) - fpu_reset_state(me); + fpu_reset_state(fpu); } else if (!use_eager_fpu()) { stts(); } @@ -371,7 +370,7 @@ void fpu__restore(void) kernel_fpu_disable(); __thread_fpu_begin(fpu); if (unlikely(restore_fpu_checking(fpu))) { - fpu_reset_state(tsk); + fpu_reset_state(fpu); force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); } else { tsk->thread.fpu.counter++; diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 049dc619481d..3953cbf8d7e7 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -343,7 +343,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) config_enabled(CONFIG_IA32_EMULATION)); if (!buf) { - fpu_reset_state(tsk); + fpu_reset_state(fpu); return 0; } @@ -417,7 +417,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) */ user_fpu_begin(); if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { - fpu_reset_state(tsk); + fpu_reset_state(fpu); return -1; } } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 20a9d355af59..bcb853e44d30 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -667,7 +667,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * Ensure the signal handler starts with the new fpu state. */ if (fpu->fpstate_active) - fpu_reset_state(current); + fpu_reset_state(fpu); } signal_setup_done(failed, ksig, stepping); } -- cgit v1.2.1 From cb8818b6acb45a4e0acc2308df216f36cc5b950c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 17:39:04 +0200 Subject: x86/fpu: Use 'struct fpu' in switch_fpu_prepare() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 27 +++++++++++++-------------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 3 files changed, 15 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 579f7d0a399d..60d2c6f376f3 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -402,10 +402,9 @@ static inline void fpu_reset_state(struct fpu *fpu) */ typedef struct { int preload; } fpu_switch_t; -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) +static inline fpu_switch_t +switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) { - struct fpu *old_fpu = &old->thread.fpu; - struct fpu *new_fpu = &new->thread.fpu; fpu_switch_t fpu; /* @@ -413,33 +412,33 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta * or if the past 5 consecutive context-switches used math. */ fpu.preload = new_fpu->fpstate_active && - (use_eager_fpu() || new->thread.fpu.counter > 5); + (use_eager_fpu() || new_fpu->counter > 5); if (old_fpu->has_fpu) { - if (!fpu_save_init(&old->thread.fpu)) - old->thread.fpu.last_cpu = -1; + if (!fpu_save_init(old_fpu)) + old_fpu->last_cpu = -1; else - old->thread.fpu.last_cpu = cpu; + old_fpu->last_cpu = cpu; /* But leave fpu_fpregs_owner_ctx! */ - old->thread.fpu.has_fpu = 0; + old_fpu->has_fpu = 0; /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new->thread.fpu.counter++; + new_fpu->counter++; __thread_set_has_fpu(new_fpu); - prefetch(new->thread.fpu.state); + prefetch(new_fpu->state); } else if (!use_eager_fpu()) stts(); } else { - old->thread.fpu.counter = 0; - old->thread.fpu.last_cpu = -1; + old_fpu->counter = 0; + old_fpu->last_cpu = -1; if (fpu.preload) { - new->thread.fpu.counter++; + new_fpu->counter++; if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else - prefetch(new->thread.fpu.state); + prefetch(new_fpu->state); __thread_fpu_begin(new_fpu); } } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 1a0edce626b2..5b0ed71dde60 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -248,7 +248,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu = switch_fpu_prepare(prev_p, next_p, cpu); + fpu = switch_fpu_prepare(&prev_p->thread.fpu, &next_p->thread.fpu, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 99cc4b8589ad..fefe65efd9d6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -278,7 +278,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) unsigned fsindex, gsindex; fpu_switch_t fpu; - fpu = switch_fpu_prepare(prev_p, next_p, cpu); + fpu = switch_fpu_prepare(&prev_p->thread.fpu, &next_p->thread.fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). -- cgit v1.2.1 From 384a23f939912d368d2b42e1b41992be09aaf266 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 17:43:27 +0200 Subject: x86/fpu: Use 'struct fpu' in switch_fpu_finish() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 6 ++---- arch/x86/kernel/process_32.c | 10 ++++++---- arch/x86/kernel/process_64.c | 8 +++++--- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 60d2c6f376f3..5fd9b3f9be0f 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -451,11 +451,9 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) * state - all we need to do is to conditionally restore the register * state itself. */ -static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) +static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) { - struct fpu *new_fpu = &new->thread.fpu; - - if (fpu.preload) { + if (fpu_switch.preload) { if (unlikely(restore_fpu_checking(new_fpu))) fpu_reset_state(new_fpu); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 5b0ed71dde60..7adc314b5075 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -241,14 +241,16 @@ __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; + *next = &next_p->thread; + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - fpu_switch_t fpu; + fpu_switch_t fpu_switch; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu = switch_fpu_prepare(&prev_p->thread.fpu, &next_p->thread.fpu, cpu); + fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -318,7 +320,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switch_fpu_finish(next_p, fpu); + switch_fpu_finish(next_fpu, fpu_switch); this_cpu_write(current_task, next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index fefe65efd9d6..4504569c6c4e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -273,12 +273,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned fsindex, gsindex; - fpu_switch_t fpu; + fpu_switch_t fpu_switch; - fpu = switch_fpu_prepare(&prev_p->thread.fpu, &next_p->thread.fpu, cpu); + fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -390,7 +392,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; - switch_fpu_finish(next_p, fpu); + switch_fpu_finish(next_fpu, fpu_switch); /* * Switch the PDA and FPU contexts. -- cgit v1.2.1 From 2d75bcf31470b15205f915aae725a284bc8f2da8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 17:49:29 +0200 Subject: x86/fpu: Move __save_fpu() into fpu/core.c This helper function is only used in fpu/core.c, move it there. This slightly speeds up compilation. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 11 ----------- arch/x86/kernel/fpu/core.c | 12 ++++++++++++ 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 5fd9b3f9be0f..6b84399c8839 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -501,17 +501,6 @@ static inline void user_fpu_begin(void) preempt_enable(); } -static inline void __save_fpu(struct task_struct *tsk) -{ - if (use_xsave()) { - if (unlikely(system_state == SYSTEM_BOOTING)) - xsave_state_booting(&tsk->thread.fpu.state->xsave); - else - xsave_state(&tsk->thread.fpu.state->xsave); - } else - fpu_fxsave(&tsk->thread.fpu); -} - /* * i387 state interaction */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 41c92897f574..9db4ef349c19 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -125,6 +125,18 @@ void __kernel_fpu_end(void) } EXPORT_SYMBOL(__kernel_fpu_end); +static void __save_fpu(struct task_struct *tsk) +{ + if (use_xsave()) { + if (unlikely(system_state == SYSTEM_BOOTING)) + xsave_state_booting(&tsk->thread.fpu.state->xsave); + else + xsave_state(&tsk->thread.fpu.state->xsave); + } else { + fpu_fxsave(&tsk->thread.fpu); + } +} + /* * Save the FPU state (initialize it if necessary): * -- cgit v1.2.1 From a4d8fc2e0652613426920aac429541127f8b26d8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 17:52:36 +0200 Subject: x86/fpu: Use 'struct fpu' in __fpu_save() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9db4ef349c19..7c0530082253 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -125,15 +125,15 @@ void __kernel_fpu_end(void) } EXPORT_SYMBOL(__kernel_fpu_end); -static void __save_fpu(struct task_struct *tsk) +static void __save_fpu(struct fpu *fpu) { if (use_xsave()) { if (unlikely(system_state == SYSTEM_BOOTING)) - xsave_state_booting(&tsk->thread.fpu.state->xsave); + xsave_state_booting(&fpu->state->xsave); else - xsave_state(&tsk->thread.fpu.state->xsave); + xsave_state(&fpu->state->xsave); } else { - fpu_fxsave(&tsk->thread.fpu); + fpu_fxsave(fpu); } } @@ -151,7 +151,7 @@ void fpu__save(struct task_struct *tsk) preempt_disable(); if (fpu->has_fpu) { if (use_eager_fpu()) { - __save_fpu(tsk); + __save_fpu(fpu); } else { fpu_save_init(fpu); __thread_fpu_end(fpu); @@ -231,17 +231,17 @@ EXPORT_SYMBOL_GPL(fpstate_free); */ static void fpu_copy(struct task_struct *dst, struct task_struct *src) { + struct fpu *dst_fpu = &dst->thread.fpu; + struct fpu *src_fpu = &src->thread.fpu; + WARN_ON(src != current); if (use_eager_fpu()) { memset(&dst->thread.fpu.state->xsave, 0, xstate_size); - __save_fpu(dst); + __save_fpu(dst_fpu); } else { - struct fpu *dfpu = &dst->thread.fpu; - struct fpu *sfpu = &src->thread.fpu; - fpu__save(src); - memcpy(dfpu->state, sfpu->state, xstate_size); + memcpy(dst_fpu->state, src_fpu->state, xstate_size); } } -- cgit v1.2.1 From 0c070595ceccb391100127a28ff837c50356ad67 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2015 17:57:24 +0200 Subject: x86/fpu: Use 'struct fpu' in fpu__save() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 2 +- arch/x86/kernel/fpu/core.c | 8 +++----- arch/x86/kernel/traps.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index e69989f95da5..e3b42c5379bc 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -100,7 +100,7 @@ static inline int user_has_fpu(void) return current->thread.fpu.has_fpu; } -extern void fpu__save(struct task_struct *tsk); +extern void fpu__save(struct fpu *fpu); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7c0530082253..b685e9e90491 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -142,11 +142,9 @@ static void __save_fpu(struct fpu *fpu) * * This only ever gets called for the current task. */ -void fpu__save(struct task_struct *tsk) +void fpu__save(struct fpu *fpu) { - struct fpu *fpu = &tsk->thread.fpu; - - WARN_ON(tsk != current); + WARN_ON(fpu != ¤t->thread.fpu); preempt_disable(); if (fpu->has_fpu) { @@ -240,7 +238,7 @@ static void fpu_copy(struct task_struct *dst, struct task_struct *src) memset(&dst->thread.fpu.state->xsave, 0, xstate_size); __save_fpu(dst_fpu); } else { - fpu__save(src); + fpu__save(src_fpu); memcpy(dst_fpu->state, src_fpu->state, xstate_size); } } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 22ad90a40dbf..8abcd6a6f3dc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -730,7 +730,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) /* * Save the info for the exception handler and clear the error. */ - fpu__save(task); + fpu__save(&task->thread.fpu); task->thread.trap_nr = trapnr; task->thread.error_code = error_code; info.si_signo = SIGFPE; -- cgit v1.2.1 From f9bc977fe734772a7ca4a467fe4fd74e1ea3a849 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 02:07:33 +0200 Subject: x86/fpu: Use 'struct fpu' in fpu_copy() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b685e9e90491..9aaba6abfae3 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -227,15 +227,12 @@ EXPORT_SYMBOL_GPL(fpstate_free); * In the 'lazy' case we save to the source context, mark the FPU lazy * via stts() and copy the source context into the destination context. */ -static void fpu_copy(struct task_struct *dst, struct task_struct *src) +static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - struct fpu *dst_fpu = &dst->thread.fpu; - struct fpu *src_fpu = &src->thread.fpu; - - WARN_ON(src != current); + WARN_ON(src_fpu != ¤t->thread.fpu); if (use_eager_fpu()) { - memset(&dst->thread.fpu.state->xsave, 0, xstate_size); + memset(&dst_fpu->state->xsave, 0, xstate_size); __save_fpu(dst_fpu); } else { fpu__save(src_fpu); @@ -258,7 +255,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src) if (err) return err; - fpu_copy(dst, src); + fpu_copy(dst_fpu, src_fpu); } return 0; } -- cgit v1.2.1 From c69e098b1f90c0d520c4d5b5bff9f2ede95b13a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 02:07:15 +0200 Subject: x86/fpu: Use 'struct fpu' in fpu__copy() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 2 +- arch/x86/kernel/fpu/core.c | 13 +++++-------- arch/x86/kernel/process.c | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 6b84399c8839..21ad68179454 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -535,7 +535,7 @@ extern void fpstate_cache_init(void); extern int fpstate_alloc(struct fpu *fpu); extern void fpstate_free(struct fpu *fpu); -extern int fpu__copy(struct task_struct *dst, struct task_struct *src); +extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); static inline unsigned long alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9aaba6abfae3..a84358575235 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -240,15 +240,12 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) } } -int fpu__copy(struct task_struct *dst, struct task_struct *src) +int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - struct fpu *dst_fpu = &dst->thread.fpu; - struct fpu *src_fpu = &src->thread.fpu; - - dst->thread.fpu.counter = 0; - dst->thread.fpu.has_fpu = 0; - dst->thread.fpu.state = NULL; - dst->thread.fpu.last_cpu = -1; + dst_fpu->counter = 0; + dst_fpu->has_fpu = 0; + dst_fpu->state = NULL; + dst_fpu->last_cpu = -1; if (src_fpu->fpstate_active) { int err = fpstate_alloc(dst_fpu); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 50d503a2d8c3..e97266b18ad3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -83,7 +83,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; - return fpu__copy(dst, src); + return fpu__copy(&dst->thread.fpu, &src->thread.fpu); } void arch_release_task_struct(struct task_struct *tsk) -- cgit v1.2.1 From db2b1d3ad1cdae9f268d6db54b6127b09933da3d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 02:13:09 +0200 Subject: x86/fpu: Use 'struct fpu' in fpstate_alloc_init() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 2 +- arch/x86/kernel/fpu/core.c | 13 ++++++------- arch/x86/kernel/fpu/xsave.c | 2 +- arch/x86/kvm/x86.c | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index e3b42c5379bc..38376cdf297c 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -18,7 +18,7 @@ struct pt_regs; struct user_i387_struct; -extern int fpstate_alloc_init(struct task_struct *curr); +extern int fpstate_alloc_init(struct fpu *fpu); extern void fpstate_init(struct fpu *fpu); extern void fpu__flush_thread(struct task_struct *tsk); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a84358575235..183e69dfd4d0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -263,12 +263,11 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * * Can fail. */ -int fpstate_alloc_init(struct task_struct *curr) +int fpstate_alloc_init(struct fpu *fpu) { - struct fpu *fpu = &curr->thread.fpu; int ret; - if (WARN_ON_ONCE(curr != current)) + if (WARN_ON_ONCE(fpu != ¤t->thread.fpu)) return -EINVAL; if (WARN_ON_ONCE(fpu->fpstate_active)) return -EINVAL; @@ -276,11 +275,11 @@ int fpstate_alloc_init(struct task_struct *curr) /* * Memory allocation at the first usage of the FPU and other state. */ - ret = fpstate_alloc(&curr->thread.fpu); + ret = fpstate_alloc(fpu); if (ret) return ret; - fpstate_init(&curr->thread.fpu); + fpstate_init(fpu); /* Safe to do for the current task: */ fpu->fpstate_active = 1; @@ -360,7 +359,7 @@ void fpu__restore(void) /* * does a slab alloc which can sleep */ - if (fpstate_alloc_init(tsk)) { + if (fpstate_alloc_init(fpu)) { /* * ran out of memory! */ @@ -396,7 +395,7 @@ void fpu__flush_thread(struct task_struct *tsk) } else { if (!fpu->fpstate_active) { /* kthread execs. TODO: cleanup this horror. */ - if (WARN_ON(fpstate_alloc_init(tsk))) + if (WARN_ON(fpstate_alloc_init(fpu))) force_sig(SIGKILL, tsk); user_fpu_begin(); } diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 3953cbf8d7e7..80b0c8fa50c5 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -350,7 +350,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_READ, buf, size)) return -EACCES; - if (!fpu->fpstate_active && fpstate_alloc_init(tsk)) + if (!fpu->fpstate_active && fpstate_alloc_init(fpu)) return -1; if (!static_cpu_has(X86_FEATURE_FPU)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bab8afb61dc1..479d4ce25081 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6601,7 +6601,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - if (!fpu->fpstate_active && fpstate_alloc_init(current)) + if (!fpu->fpstate_active && fpstate_alloc_init(fpu)) return -ENOMEM; if (vcpu->sigset_active) diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index f1aac55d6a67..e394bcb4275d 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -150,7 +150,7 @@ void math_emulate(struct math_emu_info *info) struct fpu *fpu = ¤t->thread.fpu; if (!fpu->fpstate_active) { - if (fpstate_alloc_init(current)) { + if (fpstate_alloc_init(fpu)) { do_group_exit(SIGKILL); return; } -- cgit v1.2.1 From cc08d5459905a4155cb77e5fe25f396b4c622b7d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 02:18:23 +0200 Subject: x86/fpu: Use 'struct fpu' in fpu__unlazy_stopped() Migrate this function to pure 'struct fpu' usage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 183e69dfd4d0..e3e8585284ad 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -311,27 +311,26 @@ EXPORT_SYMBOL_GPL(fpstate_alloc_init); * the read-only case, it's not strictly necessary for * read-only access to the context. */ -static int fpu__unlazy_stopped(struct task_struct *child) +static int fpu__unlazy_stopped(struct fpu *child_fpu) { - struct fpu *child_fpu = &child->thread.fpu; int ret; - if (WARN_ON_ONCE(child == current)) + if (WARN_ON_ONCE(child_fpu == ¤t->thread.fpu)) return -EINVAL; if (child_fpu->fpstate_active) { - child->thread.fpu.last_cpu = -1; + child_fpu->last_cpu = -1; return 0; } /* * Memory allocation at the first usage of the FPU and other state. */ - ret = fpstate_alloc(&child->thread.fpu); + ret = fpstate_alloc(child_fpu); if (ret) return ret; - fpstate_init(&child->thread.fpu); + fpstate_init(child_fpu); /* Safe to do for stopped child tasks: */ child_fpu->fpstate_active = 1; @@ -426,12 +425,13 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { + struct fpu *fpu = &target->thread.fpu; int ret; if (!cpu_has_fxsr) return -ENODEV; - ret = fpu__unlazy_stopped(target); + ret = fpu__unlazy_stopped(fpu); if (ret) return ret; @@ -445,12 +445,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + struct fpu *fpu = &target->thread.fpu; int ret; if (!cpu_has_fxsr) return -ENODEV; - ret = fpu__unlazy_stopped(target); + ret = fpu__unlazy_stopped(fpu); if (ret) return ret; @@ -478,13 +479,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { + struct fpu *fpu = &target->thread.fpu; struct xsave_struct *xsave; int ret; if (!cpu_has_xsave) return -ENODEV; - ret = fpu__unlazy_stopped(target); + ret = fpu__unlazy_stopped(fpu); if (ret) return ret; @@ -508,13 +510,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + struct fpu *fpu = &target->thread.fpu; struct xsave_struct *xsave; int ret; if (!cpu_has_xsave) return -ENODEV; - ret = fpu__unlazy_stopped(target); + ret = fpu__unlazy_stopped(fpu); if (ret) return ret; @@ -674,10 +677,11 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { + struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; int ret; - ret = fpu__unlazy_stopped(target); + ret = fpu__unlazy_stopped(fpu); if (ret) return ret; @@ -705,10 +709,11 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; int ret; - ret = fpu__unlazy_stopped(target); + ret = fpu__unlazy_stopped(fpu); if (ret) return ret; -- cgit v1.2.1 From 2e8a3102662233dfac92fe70f56429b4050f674a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 02:28:23 +0200 Subject: x86/fpu: Rename fpu__flush_thread() to fpu__clear() The primary purpose of this function is to clear the current task's FPU before an exec(), to not leak information from the previous task, and to allow the new task to start with freshly initialized FPU registers. Rename the function to reflect this primary purpose. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 2 +- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/process.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 38376cdf297c..b8f7d76ac066 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -20,7 +20,7 @@ struct user_i387_struct; extern int fpstate_alloc_init(struct fpu *fpu); extern void fpstate_init(struct fpu *fpu); -extern void fpu__flush_thread(struct task_struct *tsk); +extern void fpu__clear(struct task_struct *tsk); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern void fpu__restore(void); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e3e8585284ad..c15d064ce43e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -381,11 +381,11 @@ void fpu__restore(void) } EXPORT_SYMBOL_GPL(fpu__restore); -void fpu__flush_thread(struct task_struct *tsk) +void fpu__clear(struct task_struct *tsk) { struct fpu *fpu = &tsk->thread.fpu; - WARN_ON(tsk != current); + WARN_ON_ONCE(tsk != current); /* Almost certainly an anomaly */ if (!use_eager_fpu()) { /* FPU state will be reallocated lazily at the first use. */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e97266b18ad3..51ad3422e728 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -130,7 +130,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - fpu__flush_thread(tsk); + fpu__clear(tsk); } static void hard_disable_TSC(void) -- cgit v1.2.1 From e11267c13fab2c8b0e5ed3f3ae8f6167f03f8953 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 02:34:05 +0200 Subject: x86/fpu: Clean up fpu__clear() a bit Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c15d064ce43e..176f69b24358 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -390,11 +390,11 @@ void fpu__clear(struct task_struct *tsk) if (!use_eager_fpu()) { /* FPU state will be reallocated lazily at the first use. */ drop_fpu(fpu); - fpstate_free(&tsk->thread.fpu); + fpstate_free(fpu); } else { if (!fpu->fpstate_active) { /* kthread execs. TODO: cleanup this horror. */ - if (WARN_ON(fpstate_alloc_init(fpu))) + if (WARN_ON(fpstate_alloc_init(fpu))) force_sig(SIGKILL, tsk); user_fpu_begin(); } -- cgit v1.2.1 From df6b35f409af0a8ff1ef62f552b8402f3fef8665 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 02:46:00 +0200 Subject: x86/fpu: Rename i387.h to fpu/api.h We already have fpu/types.h, move i387.h to fpu/api.h. The file name has become a misnomer anyway: it offers generic FPU APIs, but is not limited to i387 functionality. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/aesni-intel_glue.c | 2 +- arch/x86/crypto/crc32-pclmul_glue.c | 2 +- arch/x86/crypto/crct10dif-pclmul_glue.c | 2 +- arch/x86/crypto/fpu.c | 2 +- arch/x86/crypto/ghash-clmulni-intel_glue.c | 2 +- arch/x86/crypto/sha1_ssse3_glue.c | 2 +- arch/x86/crypto/sha256_ssse3_glue.c | 2 +- arch/x86/crypto/sha512_ssse3_glue.c | 2 +- arch/x86/crypto/twofish_avx_glue.c | 2 +- arch/x86/include/asm/crypto/glue_helper.h | 2 +- arch/x86/include/asm/efi.h | 2 +- arch/x86/include/asm/fpu-internal.h | 2 +- arch/x86/include/asm/fpu/api.h | 107 +++++++++++++++++++++++++++++ arch/x86/include/asm/i387.h | 107 ----------------------------- arch/x86/include/asm/simd.h | 2 +- arch/x86/include/asm/suspend_32.h | 2 +- arch/x86/include/asm/suspend_64.h | 2 +- arch/x86/include/asm/xor.h | 2 +- arch/x86/include/asm/xor_32.h | 2 +- arch/x86/include/asm/xor_avx.h | 2 +- arch/x86/kernel/cpu/bugs.c | 2 +- arch/x86/kernel/fpu/xsave.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/lguest/boot.c | 2 +- arch/x86/lib/mmx_32.c | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- 26 files changed, 131 insertions(+), 131 deletions(-) create mode 100644 arch/x86/include/asm/fpu/api.h delete mode 100644 arch/x86/include/asm/i387.h (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 112cefacf2af..b419f43ce0c5 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 1937fc1d8763..07d2c6c86a54 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -35,7 +35,7 @@ #include #include -#include +#include #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index b6c67bf30fdf..a3fcfc97a311 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index f368ba261739..5a2f30f9f52d 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include struct crypto_fpu_ctx { struct crypto_blkcipher *child; diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 2079baf06bdd..64d7cf1b50e1 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #define GHASH_BLOCK_SIZE 16 diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 33d1b9dc14cc..cb3bf19dca5a 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index ccc338881ee8..9eaf7abaf4dc 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index d9fa4c1e063f..e0d6a67f567d 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index b5e2d5651851..1a66e6110f4b 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 1eef55596e82..03bb1065c335 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -7,7 +7,7 @@ #include #include -#include +#include #include typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 3738b138b843..155162ea0e00 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H -#include +#include #include /* diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 21ad68179454..d68b349b4247 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -15,7 +15,7 @@ #include #include -#include +#include #include #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h new file mode 100644 index 000000000000..9d3a6f3cfc1b --- /dev/null +++ b/arch/x86/include/asm/fpu/api.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + * x86-64 work by Andi Kleen 2002 + */ + +#ifndef _ASM_X86_FPU_API_H +#define _ASM_X86_FPU_API_H + +#ifndef __ASSEMBLY__ + +#include +#include + +struct pt_regs; +struct user_i387_struct; + +extern int fpstate_alloc_init(struct fpu *fpu); +extern void fpstate_init(struct fpu *fpu); +extern void fpu__clear(struct task_struct *tsk); + +extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); +extern void fpu__restore(void); +extern void fpu__init_check_bugs(void); + +extern bool irq_fpu_usable(void); + +/* + * Careful: __kernel_fpu_begin/end() must be called with preempt disabled + * and they don't touch the preempt state on their own. + * If you enable preemption after __kernel_fpu_begin(), preempt notifier + * should call the __kernel_fpu_end() to prevent the kernel/user FPU + * state from getting corrupted. KVM for example uses this model. + * + * All other cases use kernel_fpu_begin/end() which disable preemption + * during kernel FPU usage. + */ +extern void __kernel_fpu_begin(void); +extern void __kernel_fpu_end(void); + +static inline void kernel_fpu_begin(void) +{ + preempt_disable(); + WARN_ON_ONCE(!irq_fpu_usable()); + __kernel_fpu_begin(); +} + +static inline void kernel_fpu_end(void) +{ + __kernel_fpu_end(); + preempt_enable(); +} + +/* + * Some instructions like VIA's padlock instructions generate a spurious + * DNA fault but don't modify SSE registers. And these instructions + * get used from interrupt context as well. To prevent these kernel instructions + * in interrupt context interacting wrongly with other user/kernel fpu usage, we + * should use them only in the context of irq_ts_save/restore() + */ +static inline int irq_ts_save(void) +{ + /* + * If in process context and not atomic, we can take a spurious DNA fault. + * Otherwise, doing clts() in process context requires disabling preemption + * or some heavy lifting like kernel_fpu_begin() + */ + if (!in_atomic()) + return 0; + + if (read_cr0() & X86_CR0_TS) { + clts(); + return 1; + } + + return 0; +} + +static inline void irq_ts_restore(int TS_state) +{ + if (TS_state) + stts(); +} + +/* + * The question "does this thread have fpu access?" + * is slightly racy, since preemption could come in + * and revoke it immediately after the test. + * + * However, even in that very unlikely scenario, + * we can just assume we have FPU access - typically + * to save the FP state - we'll just take a #NM + * fault and get the FPU access back. + */ +static inline int user_has_fpu(void) +{ + return current->thread.fpu.has_fpu; +} + +extern void fpu__save(struct fpu *fpu); + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h deleted file mode 100644 index b8f7d76ac066..000000000000 --- a/arch/x86/include/asm/i387.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes , May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _ASM_X86_I387_H -#define _ASM_X86_I387_H - -#ifndef __ASSEMBLY__ - -#include -#include - -struct pt_regs; -struct user_i387_struct; - -extern int fpstate_alloc_init(struct fpu *fpu); -extern void fpstate_init(struct fpu *fpu); -extern void fpu__clear(struct task_struct *tsk); - -extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); -extern void fpu__restore(void); -extern void fpu__init_check_bugs(void); - -extern bool irq_fpu_usable(void); - -/* - * Careful: __kernel_fpu_begin/end() must be called with preempt disabled - * and they don't touch the preempt state on their own. - * If you enable preemption after __kernel_fpu_begin(), preempt notifier - * should call the __kernel_fpu_end() to prevent the kernel/user FPU - * state from getting corrupted. KVM for example uses this model. - * - * All other cases use kernel_fpu_begin/end() which disable preemption - * during kernel FPU usage. - */ -extern void __kernel_fpu_begin(void); -extern void __kernel_fpu_end(void); - -static inline void kernel_fpu_begin(void) -{ - preempt_disable(); - WARN_ON_ONCE(!irq_fpu_usable()); - __kernel_fpu_begin(); -} - -static inline void kernel_fpu_end(void) -{ - __kernel_fpu_end(); - preempt_enable(); -} - -/* - * Some instructions like VIA's padlock instructions generate a spurious - * DNA fault but don't modify SSE registers. And these instructions - * get used from interrupt context as well. To prevent these kernel instructions - * in interrupt context interacting wrongly with other user/kernel fpu usage, we - * should use them only in the context of irq_ts_save/restore() - */ -static inline int irq_ts_save(void) -{ - /* - * If in process context and not atomic, we can take a spurious DNA fault. - * Otherwise, doing clts() in process context requires disabling preemption - * or some heavy lifting like kernel_fpu_begin() - */ - if (!in_atomic()) - return 0; - - if (read_cr0() & X86_CR0_TS) { - clts(); - return 1; - } - - return 0; -} - -static inline void irq_ts_restore(int TS_state) -{ - if (TS_state) - stts(); -} - -/* - * The question "does this thread have fpu access?" - * is slightly racy, since preemption could come in - * and revoke it immediately after the test. - * - * However, even in that very unlikely scenario, - * we can just assume we have FPU access - typically - * to save the FP state - we'll just take a #NM - * fault and get the FPU access back. - */ -static inline int user_has_fpu(void) -{ - return current->thread.fpu.has_fpu; -} - -extern void fpu__save(struct fpu *fpu); - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_I387_H */ diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h index ee80b92f0096..6c8a7ed13365 100644 --- a/arch/x86/include/asm/simd.h +++ b/arch/x86/include/asm/simd.h @@ -1,5 +1,5 @@ -#include +#include /* * may_use_simd - whether it is allowable at this time to issue SIMD diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 552d6c90a6d4..d1793f06854d 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -7,7 +7,7 @@ #define _ASM_X86_SUSPEND_32_H #include -#include +#include /* image of the saved processor state */ struct saved_context { diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index bc6232834bab..7ebf0ebe4e68 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -7,7 +7,7 @@ #define _ASM_X86_SUSPEND_64_H #include -#include +#include /* * Image of the saved processor state, used by the low level ACPI suspend to diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index d8829751b3f8..1f5c5161ead6 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -36,7 +36,7 @@ * no advantages to be gotten from x86-64 here anyways. */ -#include +#include #ifdef CONFIG_X86_32 /* reduce register pressure */ diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index ce05722e3c68..5a08bc8bff33 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -26,7 +26,7 @@ #define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" #define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" -#include +#include static void xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 492b29802f57..7c0a517ec751 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -18,7 +18,7 @@ #ifdef CONFIG_AS_AVX #include -#include +#include #define BLOCK4(i) \ BLOCK(32 * i, 0) \ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index eb8be0c5823b..29dd74318ec6 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 80b0c8fa50c5..8aa3b864a2e0 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f7b61687bd79..5cb738a18ca3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 8f9a133cc099..27f8eea0d6eb 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #include #include /* for struct machine_ops */ #include diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index c9f2d9ba8dd8..e5e3ed8dc079 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include void *_mmx_memcpy(void *to, const void *from, size_t len) diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index e394bcb4275d..3bb4c6a24ea5 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include "fpu_system.h" #include "fpu_emu.h" -- cgit v1.2.1 From a137fb6bbf4f10b8ef1452e9b190d8bc76c04d0f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 03:58:37 +0200 Subject: x86/fpu: Move xsave.h to fpu/xsave.h Move the xsave.h header file to the FPU directory as well. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/camellia_aesni_avx2_glue.c | 2 +- arch/x86/crypto/camellia_aesni_avx_glue.c | 2 +- arch/x86/crypto/cast5_avx_glue.c | 2 +- arch/x86/crypto/cast6_avx_glue.c | 2 +- arch/x86/crypto/serpent_avx2_glue.c | 2 +- arch/x86/crypto/serpent_avx_glue.c | 2 +- arch/x86/crypto/sha-mb/sha1_mb.c | 2 +- arch/x86/crypto/sha1_ssse3_glue.c | 2 +- arch/x86/crypto/sha256_ssse3_glue.c | 2 +- arch/x86/crypto/sha512_ssse3_glue.c | 2 +- arch/x86/crypto/twofish_avx_glue.c | 2 +- arch/x86/include/asm/fpu-internal.h | 2 +- arch/x86/include/asm/fpu/xsave.h | 250 +++++++++++++++++++++++++++++ arch/x86/include/asm/xsave.h | 250 ----------------------------- arch/x86/kvm/cpuid.c | 2 +- 15 files changed, 263 insertions(+), 263 deletions(-) create mode 100644 arch/x86/include/asm/fpu/xsave.h delete mode 100644 arch/x86/include/asm/xsave.h (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index baf0ac21ace5..004acd7bb4e0 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 78818a1e73e3..2f7ead8caf53 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 236c80974457..2c3360be6fc8 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #define CAST5_PARALLEL_BLOCKS 16 diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index f448810ca4ac..a2ec18a56e4f 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #define CAST6_PARALLEL_BLOCKS 8 diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 2f63dc89e7a9..206ec57725a3 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index c8d478af8456..4feb68c9a41f 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index 15373786494f..02b64bbc1d48 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -66,7 +66,7 @@ #include #include #include -#include +#include #include #include #include "sha_mb_ctx.h" diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index cb3bf19dca5a..71ab2b35d5e0 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 9eaf7abaf4dc..dcbd8ea6eaaf 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index e0d6a67f567d..e8836e0c1098 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 1a66e6110f4b..3b6c8ba64f81 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index d68b349b4247..20690a14c73a 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -16,7 +16,7 @@ #include #include -#include +#include #ifdef CONFIG_X86_64 # include diff --git a/arch/x86/include/asm/fpu/xsave.h b/arch/x86/include/asm/fpu/xsave.h new file mode 100644 index 000000000000..7c90ea93c54e --- /dev/null +++ b/arch/x86/include/asm/fpu/xsave.h @@ -0,0 +1,250 @@ +#ifndef __ASM_X86_XSAVE_H +#define __ASM_X86_XSAVE_H + +#include +#include + +#define XSTATE_CPUID 0x0000000d + +#define XSTATE_FP 0x1 +#define XSTATE_SSE 0x2 +#define XSTATE_YMM 0x4 +#define XSTATE_BNDREGS 0x8 +#define XSTATE_BNDCSR 0x10 +#define XSTATE_OPMASK 0x20 +#define XSTATE_ZMM_Hi256 0x40 +#define XSTATE_Hi16_ZMM 0x80 + +#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) +/* Bit 63 of XCR0 is reserved for future expansion */ +#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) + +#define FXSAVE_SIZE 512 + +#define XSAVE_HDR_SIZE 64 +#define XSAVE_HDR_OFFSET FXSAVE_SIZE + +#define XSAVE_YMM_SIZE 256 +#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) + +/* Supported features which support lazy state saving */ +#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) + +/* Supported features which require eager state saving */ +#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) + +/* All currently supported features */ +#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER) + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +extern unsigned int xstate_size; +extern u64 pcntxt_mask; +extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; +extern struct xsave_struct *init_xstate_buf; + +extern void xsave_init(void); +extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +#define xstate_fault ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err) + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline int xsave_state_booting(struct xsave_struct *fx) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XSAVES"\n\t" + "2:\n\t" + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + else + asm volatile("1:"XSAVE"\n\t" + "2:\n\t" + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + return err; +} + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XRSTORS"\n\t" + "2:\n\t" + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + else + asm volatile("1:"XRSTOR"\n\t" + "2:\n\t" + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + return err; +} + +/* + * Save processor xstate to xsave area. + */ +static inline int xsave_state(struct xsave_struct *fx) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + /* + * If xsaves is enabled, xsaves replaces xsaveopt because + * it supports compact format and supervisor states in addition to + * modified optimization in xsaveopt. + * + * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave + * because xsaveopt supports modified optimization which is not + * supported by xsave. + * + * If none of xsaves and xsaveopt is enabled, use xsave. + */ + alternative_input_2( + "1:"XSAVE, + XSAVEOPT, + X86_FEATURE_XSAVEOPT, + XSAVES, + X86_FEATURE_XSAVES, + [fx] "D" (fx), "a" (lmask), "d" (hmask) : + "memory"); + asm volatile("2:\n\t" + xstate_fault + : "0" (0) + : "memory"); + + return err; +} + +/* + * Restore processor xstate from xsave area. + */ +static inline int xrstor_state(struct xsave_struct *fx, u64 mask) +{ + int err = 0; + u32 lmask = mask; + u32 hmask = mask >> 32; + + /* + * Use xrstors to restore context if it is enabled. xrstors supports + * compacted format of xsave area which is not supported by xrstor. + */ + alternative_input( + "1: " XRSTOR, + XRSTORS, + X86_FEATURE_XSAVES, + "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile("2:\n" + xstate_fault + : "0" (0) + : "memory"); + + return err; +} + +/* + * Restore xstate context for new process during context switch. + */ +static inline int fpu_xrstor_checking(struct xsave_struct *fx) +{ + return xrstor_state(fx, -1); +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for + * backward compatibility for old applications which don't understand + * compacted format of xsave area. + */ +static inline int xsave_user(struct xsave_struct __user *buf) +{ + int err; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr)); + if (unlikely(err)) + return -EFAULT; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XSAVE"\n" + "2: " ASM_CLAC "\n" + xstate_fault + : "D" (buf), "a" (-1), "d" (-1), "0" (0) + : "memory"); + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) +{ + int err = 0; + struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XRSTOR"\n" + "2: " ASM_CLAC "\n" + xstate_fault + : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) + : "memory"); /* memory required? */ + return err; +} + +void *get_xsave_addr(struct xsave_struct *xsave, int xstate); +void setup_xstate_comp(void); + +#endif diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h deleted file mode 100644 index 7c90ea93c54e..000000000000 --- a/arch/x86/include/asm/xsave.h +++ /dev/null @@ -1,250 +0,0 @@ -#ifndef __ASM_X86_XSAVE_H -#define __ASM_X86_XSAVE_H - -#include -#include - -#define XSTATE_CPUID 0x0000000d - -#define XSTATE_FP 0x1 -#define XSTATE_SSE 0x2 -#define XSTATE_YMM 0x4 -#define XSTATE_BNDREGS 0x8 -#define XSTATE_BNDCSR 0x10 -#define XSTATE_OPMASK 0x20 -#define XSTATE_ZMM_Hi256 0x40 -#define XSTATE_Hi16_ZMM 0x80 - -#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) -#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) -/* Bit 63 of XCR0 is reserved for future expansion */ -#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) - -#define FXSAVE_SIZE 512 - -#define XSAVE_HDR_SIZE 64 -#define XSAVE_HDR_OFFSET FXSAVE_SIZE - -#define XSAVE_YMM_SIZE 256 -#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) - -/* Supported features which support lazy state saving */ -#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) - -/* Supported features which require eager state saving */ -#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) - -/* All currently supported features */ -#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER) - -#ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " -#else -#define REX_PREFIX -#endif - -extern unsigned int xstate_size; -extern u64 pcntxt_mask; -extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern struct xsave_struct *init_xstate_buf; - -extern void xsave_init(void); -extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); - -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" - -#define xstate_fault ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err) - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int xsave_state_booting(struct xsave_struct *fx) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XSAVES"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XSAVE"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XRSTORS"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XRSTOR"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * Save processor xstate to xsave area. - */ -static inline int xsave_state(struct xsave_struct *fx) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - /* - * If xsaves is enabled, xsaves replaces xsaveopt because - * it supports compact format and supervisor states in addition to - * modified optimization in xsaveopt. - * - * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave - * because xsaveopt supports modified optimization which is not - * supported by xsave. - * - * If none of xsaves and xsaveopt is enabled, use xsave. - */ - alternative_input_2( - "1:"XSAVE, - XSAVEOPT, - X86_FEATURE_XSAVEOPT, - XSAVES, - X86_FEATURE_XSAVES, - [fx] "D" (fx), "a" (lmask), "d" (hmask) : - "memory"); - asm volatile("2:\n\t" - xstate_fault - : "0" (0) - : "memory"); - - return err; -} - -/* - * Restore processor xstate from xsave area. - */ -static inline int xrstor_state(struct xsave_struct *fx, u64 mask) -{ - int err = 0; - u32 lmask = mask; - u32 hmask = mask >> 32; - - /* - * Use xrstors to restore context if it is enabled. xrstors supports - * compacted format of xsave area which is not supported by xrstor. - */ - alternative_input( - "1: " XRSTOR, - XRSTORS, - X86_FEATURE_XSAVES, - "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - - asm volatile("2:\n" - xstate_fault - : "0" (0) - : "memory"); - - return err; -} - -/* - * Restore xstate context for new process during context switch. - */ -static inline int fpu_xrstor_checking(struct xsave_struct *fx) -{ - return xrstor_state(fx, -1); -} - -/* - * Save xstate to user space xsave area. - * - * We don't use modified optimization because xrstor/xrstors might track - * a different application. - * - * We don't use compacted format xsave area for - * backward compatibility for old applications which don't understand - * compacted format of xsave area. - */ -static inline int xsave_user(struct xsave_struct __user *buf) -{ - int err; - - /* - * Clear the xsave header first, so that reserved fields are - * initialized to zero. - */ - err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr)); - if (unlikely(err)) - return -EFAULT; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XSAVE"\n" - "2: " ASM_CLAC "\n" - xstate_fault - : "D" (buf), "a" (-1), "d" (-1), "0" (0) - : "memory"); - return err; -} - -/* - * Restore xstate from user space xsave area. - */ -static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) -{ - int err = 0; - struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); - u32 lmask = mask; - u32 hmask = mask >> 32; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XRSTOR"\n" - "2: " ASM_CLAC "\n" - xstate_fault - : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) - : "memory"); /* memory required? */ - return err; -} - -void *get_xsave_addr(struct xsave_struct *xsave, int xstate); -void setup_xstate_comp(void); - -#endif diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 59b69f6a2844..0ce4c4f87332 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "cpuid.h" #include "lapic.h" #include "mmu.h" -- cgit v1.2.1 From 78f7f1e54bac032b98956862a5bcf8c28ed22d07 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 02:54:44 +0200 Subject: x86/fpu: Rename fpu-internal.h to fpu/internal.h This unifies all the FPU related header files under a unified, hiearchical naming scheme: - asm/fpu/types.h: FPU related data types, needed for 'struct task_struct', widely included in almost all kernel code, and hence kept as small as possible. - asm/fpu/api.h: FPU related 'public' methods exported to other subsystems. - asm/fpu/internal.h: FPU subsystem internal methods - asm/fpu/xsave.h: XSAVE support internal methods (Also standardize the header guard in asm/fpu/internal.h.) Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/crc32c-intel_glue.c | 2 +- arch/x86/crypto/sha-mb/sha1_mb.c | 2 +- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu-internal.h | 556 ------------------------------------ arch/x86/include/asm/fpu/internal.h | 556 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/init.c | 2 +- arch/x86/kernel/fpu/xsave.c | 2 +- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/signal.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/x86.c | 2 +- arch/x86/mm/mpx.c | 2 +- arch/x86/power/cpu.c | 2 +- 19 files changed, 573 insertions(+), 573 deletions(-) delete mode 100644 arch/x86/include/asm/fpu-internal.h create mode 100644 arch/x86/include/asm/fpu/internal.h (limited to 'arch/x86') diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 470522cb042a..81a595d75cf5 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -32,7 +32,7 @@ #include #include -#include +#include #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index 02b64bbc1d48..03ffaf8c2244 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -68,7 +68,7 @@ #include #include #include -#include +#include #include "sha_mb_ctx.h" #define FLUSH_INTERVAL 1000 /* in usec */ diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index e1ec6f90d09e..d6d8f4ca5136 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h deleted file mode 100644 index 20690a14c73a..000000000000 --- a/arch/x86/include/asm/fpu-internal.h +++ /dev/null @@ -1,556 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes , May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _FPU_INTERNAL_H -#define _FPU_INTERNAL_H - -#include -#include -#include - -#include -#include -#include - -#ifdef CONFIG_X86_64 -# include -# include -struct ksignal; -int ia32_setup_rt_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -#else -# define user_i387_ia32_struct user_i387_struct -# define user32_fxsr_struct user_fxsr_struct -# define ia32_setup_frame __setup_frame -# define ia32_setup_rt_frame __setup_rt_frame -#endif - -extern unsigned int mxcsr_feature_mask; -extern void fpu__cpu_init(void); -extern void eager_fpu_init(void); - -DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); - -extern void convert_from_fxsr(struct user_i387_ia32_struct *env, - struct task_struct *tsk); -extern void convert_to_fxsr(struct task_struct *tsk, - const struct user_i387_ia32_struct *env); - -extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; - -/* - * xstateregs_active == fpregs_active. Please refer to the comment - * at the definition of fpregs_active. - */ -#define xstateregs_active fpregs_active - -#ifdef CONFIG_MATH_EMULATION -extern void finit_soft_fpu(struct i387_soft_struct *soft); -#else -static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} -#endif - -/* - * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, - * on this CPU. - * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. - */ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) -{ - per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; -} - -static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) -{ - return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; -} - -static inline int is_ia32_compat_frame(void) -{ - return config_enabled(CONFIG_IA32_EMULATION) && - test_thread_flag(TIF_IA32); -} - -static inline int is_ia32_frame(void) -{ - return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); -} - -static inline int is_x32_frame(void) -{ - return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); -} - -#define X87_FSW_ES (1 << 7) /* Exception Summary */ - -static __always_inline __pure bool use_eager_fpu(void) -{ - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); -} - -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has_safe(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has_safe(X86_FEATURE_FXSR); -} - -static inline void fx_finit(struct i387_fxsave_struct *fx) -{ - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; -} - -extern void __sanitize_i387_state(struct task_struct *); - -static inline void sanitize_i387_state(struct task_struct *tsk) -{ - if (!use_xsaveopt()) - return; - __sanitize_i387_state(tsk); -} - -#define user_insn(insn, output, input...) \ -({ \ - int err; \ - asm volatile(ASM_STAC "\n" \ - "1:" #insn "\n\t" \ - "2: " ASM_CLAC "\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define check_insn(insn, output, input...) \ -({ \ - int err; \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -static inline int fsave_user(struct i387_fsave_struct __user *fx) -{ - return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); -} - -static inline int fxsave_user(struct i387_fxsave_struct __user *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - - /* See comment in fpu_fxsave() below. */ - return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); -} - -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - - /* See comment in fpu_fxsave() below. */ - return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), - "m" (*fx)); -} - -static inline int fxrstor_user(struct i387_fxsave_struct __user *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - - /* See comment in fpu_fxsave() below. */ - return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), - "m" (*fx)); -} - -static inline int frstor_checking(struct i387_fsave_struct *fx) -{ - return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_user(struct i387_fsave_struct __user *fx) -{ - return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void fpu_fxsave(struct fpu *fpu) -{ - if (config_enabled(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave)); - else { - /* Using "rex64; fxsave %0" is broken because, if the memory - * operand uses any extended registers for addressing, a second - * REX prefix will be generated (to the assembler, rex64 - * followed by semicolon is a separate instruction), and hence - * the 64-bitness is lost. - * - * Using "fxsaveq %0" would be the ideal choice, but is only - * supported starting with gas 2.16. - * - * Using, as a workaround, the properly prefixed form below - * isn't accepted by any binutils version so far released, - * complaining that the same type of prefix is used twice if - * an extended register is needed for addressing (fix submitted - * to mainline 2005-11-21). - * - * asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave)); - * - * This, however, we can work around by forcing the compiler to - * select an addressing mode that doesn't require extended - * registers. - */ - asm volatile( "rex64/fxsave (%[fx])" - : "=m" (fpu->state->fxsave) - : [fx] "R" (&fpu->state->fxsave)); - } -} - -/* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact. - */ -static inline int fpu_save_init(struct fpu *fpu) -{ - if (use_xsave()) { - xsave_state(&fpu->state->xsave); - - /* - * xsave header may indicate the init state of the FP. - */ - if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) - return 1; - } else if (use_fxsr()) { - fpu_fxsave(fpu); - } else { - asm volatile("fnsave %[fx]; fwait" - : [fx] "=m" (fpu->state->fsave)); - return 0; - } - - /* - * If exceptions are pending, we need to clear them so - * that we don't randomly get exceptions later. - * - * FIXME! Is this perhaps only true for the old-style - * irq13 case? Maybe we could leave the x87 state - * intact otherwise? - */ - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { - asm volatile("fnclex"); - return 0; - } - return 1; -} - -static inline int fpu_restore_checking(struct fpu *fpu) -{ - if (use_xsave()) - return fpu_xrstor_checking(&fpu->state->xsave); - else if (use_fxsr()) - return fxrstor_checking(&fpu->state->fxsave); - else - return frstor_checking(&fpu->state->fsave); -} - -static inline int restore_fpu_checking(struct fpu *fpu) -{ - /* - * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is - * pending. Clear the x87 state here by setting it to fixed values. - * "m" is a random variable that should be in L1. - */ - if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { - asm volatile( - "fnclex\n\t" - "emms\n\t" - "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (fpu->has_fpu)); - } - - return fpu_restore_checking(fpu); -} - -/* Must be paired with an 'stts' after! */ -static inline void __thread_clear_has_fpu(struct fpu *fpu) -{ - fpu->has_fpu = 0; - this_cpu_write(fpu_fpregs_owner_ctx, NULL); -} - -/* Must be paired with a 'clts' before! */ -static inline void __thread_set_has_fpu(struct fpu *fpu) -{ - fpu->has_fpu = 1; - this_cpu_write(fpu_fpregs_owner_ctx, fpu); -} - -/* - * Encapsulate the CR0.TS handling together with the - * software flag. - * - * These generally need preemption protection to work, - * do try to avoid using these on their own. - */ -static inline void __thread_fpu_end(struct fpu *fpu) -{ - __thread_clear_has_fpu(fpu); - if (!use_eager_fpu()) - stts(); -} - -static inline void __thread_fpu_begin(struct fpu *fpu) -{ - if (!use_eager_fpu()) - clts(); - __thread_set_has_fpu(fpu); -} - -static inline void drop_fpu(struct fpu *fpu) -{ - /* - * Forget coprocessor state.. - */ - preempt_disable(); - fpu->counter = 0; - - if (fpu->has_fpu) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(fpu); - } - - fpu->fpstate_active = 0; - - preempt_enable(); -} - -static inline void restore_init_xstate(void) -{ - if (use_xsave()) - xrstor_state(init_xstate_buf, -1); - else - fxrstor_checking(&init_xstate_buf->i387); -} - -/* - * Reset the FPU state in the eager case and drop it in the lazy case (later use - * will reinit it). - */ -static inline void fpu_reset_state(struct fpu *fpu) -{ - if (!use_eager_fpu()) - drop_fpu(fpu); - else - restore_init_xstate(); -} - -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state and - * sets the new state of the CR0.TS bit. This is - * done within the context of the old process. - * - * - switch_fpu_finish() restores the new state as - * necessary. - */ -typedef struct { int preload; } fpu_switch_t; - -static inline fpu_switch_t -switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) -{ - fpu_switch_t fpu; - - /* - * If the task has used the math, pre-load the FPU on xsave processors - * or if the past 5 consecutive context-switches used math. - */ - fpu.preload = new_fpu->fpstate_active && - (use_eager_fpu() || new_fpu->counter > 5); - - if (old_fpu->has_fpu) { - if (!fpu_save_init(old_fpu)) - old_fpu->last_cpu = -1; - else - old_fpu->last_cpu = cpu; - - /* But leave fpu_fpregs_owner_ctx! */ - old_fpu->has_fpu = 0; - - /* Don't change CR0.TS if we just switch! */ - if (fpu.preload) { - new_fpu->counter++; - __thread_set_has_fpu(new_fpu); - prefetch(new_fpu->state); - } else if (!use_eager_fpu()) - stts(); - } else { - old_fpu->counter = 0; - old_fpu->last_cpu = -1; - if (fpu.preload) { - new_fpu->counter++; - if (fpu_want_lazy_restore(new_fpu, cpu)) - fpu.preload = 0; - else - prefetch(new_fpu->state); - __thread_fpu_begin(new_fpu); - } - } - return fpu; -} - -/* - * By the time this gets called, we've already cleared CR0.TS and - * given the process the FPU if we are going to preload the FPU - * state - all we need to do is to conditionally restore the register - * state itself. - */ -static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) -{ - if (fpu_switch.preload) { - if (unlikely(restore_fpu_checking(new_fpu))) - fpu_reset_state(new_fpu); - } -} - -/* - * Signal frame handlers... - */ -extern int save_xstate_sig(void __user *buf, void __user *fx, int size); -extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size); - -static inline int xstate_sigframe_size(void) -{ - return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; -} - -static inline int restore_xstate_sig(void __user *buf, int ia32_frame) -{ - void __user *buf_fx = buf; - int size = xstate_sigframe_size(); - - if (ia32_frame && use_fxsr()) { - buf_fx = buf + sizeof(struct i387_fsave_struct); - size += sizeof(struct i387_fsave_struct); - } - - return __restore_xstate_sig(buf, buf_fx, size); -} - -/* - * Needs to be preemption-safe. - * - * NOTE! user_fpu_begin() must be used only immediately before restoring - * the save state. It does not do any saving/restoring on its own. In - * lazy FPU mode, it is just an optimization to avoid a #NM exception, - * the task can lose the FPU right after preempt_enable(). - */ -static inline void user_fpu_begin(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - - preempt_disable(); - if (!user_has_fpu()) - __thread_fpu_begin(fpu); - preempt_enable(); -} - -/* - * i387 state interaction - */ -static inline unsigned short get_fpu_cwd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.cwd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.swd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) -{ - if (cpu_has_xmm) { - return tsk->thread.fpu.state->fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - -extern void fpstate_cache_init(void); - -extern int fpstate_alloc(struct fpu *fpu); -extern void fpstate_free(struct fpu *fpu); -extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); - -static inline unsigned long -alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, - unsigned long *size) -{ - unsigned long frame_size = xstate_sigframe_size(); - - *buf_fx = sp = round_down(sp - frame_size, 64); - if (ia32_frame && use_fxsr()) { - frame_size += sizeof(struct i387_fsave_struct); - sp -= sizeof(struct i387_fsave_struct); - } - - *size = frame_size; - return sp; -} - -#endif diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h new file mode 100644 index 000000000000..386a8837c358 --- /dev/null +++ b/arch/x86/include/asm/fpu/internal.h @@ -0,0 +1,556 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + * x86-64 work by Andi Kleen 2002 + */ + +#ifndef _ASM_X86_FPU_INTERNAL_H +#define _ASM_X86_FPU_INTERNAL_H + +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_X86_64 +# include +# include +struct ksignal; +int ia32_setup_rt_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs); +#else +# define user_i387_ia32_struct user_i387_struct +# define user32_fxsr_struct user_fxsr_struct +# define ia32_setup_frame __setup_frame +# define ia32_setup_rt_frame __setup_rt_frame +#endif + +extern unsigned int mxcsr_feature_mask; +extern void fpu__cpu_init(void); +extern void eager_fpu_init(void); + +DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +extern void convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk); +extern void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env); + +extern user_regset_active_fn fpregs_active, xfpregs_active; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + +/* + * xstateregs_active == fpregs_active. Please refer to the comment + * at the definition of fpregs_active. + */ +#define xstateregs_active fpregs_active + +#ifdef CONFIG_MATH_EMULATION +extern void finit_soft_fpu(struct i387_soft_struct *soft); +#else +static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} +#endif + +/* + * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, + * on this CPU. + * + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. + */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; +} + +static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + +static inline int is_ia32_compat_frame(void) +{ + return config_enabled(CONFIG_IA32_EMULATION) && + test_thread_flag(TIF_IA32); +} + +static inline int is_ia32_frame(void) +{ + return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); +} + +static inline int is_x32_frame(void) +{ + return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); +} + +#define X87_FSW_ES (1 << 7) /* Exception Summary */ + +static __always_inline __pure bool use_eager_fpu(void) +{ + return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); +} + +static __always_inline __pure bool use_xsaveopt(void) +{ + return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); +} + +static __always_inline __pure bool use_xsave(void) +{ + return static_cpu_has_safe(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return static_cpu_has_safe(X86_FEATURE_FXSR); +} + +static inline void fx_finit(struct i387_fxsave_struct *fx) +{ + fx->cwd = 0x37f; + fx->mxcsr = MXCSR_DEFAULT; +} + +extern void __sanitize_i387_state(struct task_struct *); + +static inline void sanitize_i387_state(struct task_struct *tsk) +{ + if (!use_xsaveopt()) + return; + __sanitize_i387_state(tsk); +} + +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + asm volatile(ASM_STAC "\n" \ + "1:" #insn "\n\t" \ + "2: " ASM_CLAC "\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define check_insn(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +static inline int fsave_user(struct i387_fsave_struct __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int fxsave_user(struct i387_fxsave_struct __user *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + + /* See comment in fpu_fxsave() below. */ + return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); +} + +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + + /* See comment in fpu_fxsave() below. */ + return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), + "m" (*fx)); +} + +static inline int fxrstor_user(struct i387_fxsave_struct __user *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + + /* See comment in fpu_fxsave() below. */ + return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), + "m" (*fx)); +} + +static inline int frstor_checking(struct i387_fsave_struct *fx) +{ + return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_user(struct i387_fsave_struct __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void fpu_fxsave(struct fpu *fpu) +{ + if (config_enabled(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave)); + else { + /* Using "rex64; fxsave %0" is broken because, if the memory + * operand uses any extended registers for addressing, a second + * REX prefix will be generated (to the assembler, rex64 + * followed by semicolon is a separate instruction), and hence + * the 64-bitness is lost. + * + * Using "fxsaveq %0" would be the ideal choice, but is only + * supported starting with gas 2.16. + * + * Using, as a workaround, the properly prefixed form below + * isn't accepted by any binutils version so far released, + * complaining that the same type of prefix is used twice if + * an extended register is needed for addressing (fix submitted + * to mainline 2005-11-21). + * + * asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave)); + * + * This, however, we can work around by forcing the compiler to + * select an addressing mode that doesn't require extended + * registers. + */ + asm volatile( "rex64/fxsave (%[fx])" + : "=m" (fpu->state->fxsave) + : [fx] "R" (&fpu->state->fxsave)); + } +} + +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact. + */ +static inline int fpu_save_init(struct fpu *fpu) +{ + if (use_xsave()) { + xsave_state(&fpu->state->xsave); + + /* + * xsave header may indicate the init state of the FP. + */ + if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) + return 1; + } else if (use_fxsr()) { + fpu_fxsave(fpu); + } else { + asm volatile("fnsave %[fx]; fwait" + : [fx] "=m" (fpu->state->fsave)); + return 0; + } + + /* + * If exceptions are pending, we need to clear them so + * that we don't randomly get exceptions later. + * + * FIXME! Is this perhaps only true for the old-style + * irq13 case? Maybe we could leave the x87 state + * intact otherwise? + */ + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { + asm volatile("fnclex"); + return 0; + } + return 1; +} + +static inline int fpu_restore_checking(struct fpu *fpu) +{ + if (use_xsave()) + return fpu_xrstor_checking(&fpu->state->xsave); + else if (use_fxsr()) + return fxrstor_checking(&fpu->state->fxsave); + else + return frstor_checking(&fpu->state->fsave); +} + +static inline int restore_fpu_checking(struct fpu *fpu) +{ + /* + * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is + * pending. Clear the x87 state here by setting it to fixed values. + * "m" is a random variable that should be in L1. + */ + if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %P[addr]" /* set F?P to defined value */ + : : [addr] "m" (fpu->has_fpu)); + } + + return fpu_restore_checking(fpu); +} + +/* Must be paired with an 'stts' after! */ +static inline void __thread_clear_has_fpu(struct fpu *fpu) +{ + fpu->has_fpu = 0; + this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +/* Must be paired with a 'clts' before! */ +static inline void __thread_set_has_fpu(struct fpu *fpu) +{ + fpu->has_fpu = 1; + this_cpu_write(fpu_fpregs_owner_ctx, fpu); +} + +/* + * Encapsulate the CR0.TS handling together with the + * software flag. + * + * These generally need preemption protection to work, + * do try to avoid using these on their own. + */ +static inline void __thread_fpu_end(struct fpu *fpu) +{ + __thread_clear_has_fpu(fpu); + if (!use_eager_fpu()) + stts(); +} + +static inline void __thread_fpu_begin(struct fpu *fpu) +{ + if (!use_eager_fpu()) + clts(); + __thread_set_has_fpu(fpu); +} + +static inline void drop_fpu(struct fpu *fpu) +{ + /* + * Forget coprocessor state.. + */ + preempt_disable(); + fpu->counter = 0; + + if (fpu->has_fpu) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + __thread_fpu_end(fpu); + } + + fpu->fpstate_active = 0; + + preempt_enable(); +} + +static inline void restore_init_xstate(void) +{ + if (use_xsave()) + xrstor_state(init_xstate_buf, -1); + else + fxrstor_checking(&init_xstate_buf->i387); +} + +/* + * Reset the FPU state in the eager case and drop it in the lazy case (later use + * will reinit it). + */ +static inline void fpu_reset_state(struct fpu *fpu) +{ + if (!use_eager_fpu()) + drop_fpu(fpu); + else + restore_init_xstate(); +} + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state and + * sets the new state of the CR0.TS bit. This is + * done within the context of the old process. + * + * - switch_fpu_finish() restores the new state as + * necessary. + */ +typedef struct { int preload; } fpu_switch_t; + +static inline fpu_switch_t +switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) +{ + fpu_switch_t fpu; + + /* + * If the task has used the math, pre-load the FPU on xsave processors + * or if the past 5 consecutive context-switches used math. + */ + fpu.preload = new_fpu->fpstate_active && + (use_eager_fpu() || new_fpu->counter > 5); + + if (old_fpu->has_fpu) { + if (!fpu_save_init(old_fpu)) + old_fpu->last_cpu = -1; + else + old_fpu->last_cpu = cpu; + + /* But leave fpu_fpregs_owner_ctx! */ + old_fpu->has_fpu = 0; + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + new_fpu->counter++; + __thread_set_has_fpu(new_fpu); + prefetch(new_fpu->state); + } else if (!use_eager_fpu()) + stts(); + } else { + old_fpu->counter = 0; + old_fpu->last_cpu = -1; + if (fpu.preload) { + new_fpu->counter++; + if (fpu_want_lazy_restore(new_fpu, cpu)) + fpu.preload = 0; + else + prefetch(new_fpu->state); + __thread_fpu_begin(new_fpu); + } + } + return fpu; +} + +/* + * By the time this gets called, we've already cleared CR0.TS and + * given the process the FPU if we are going to preload the FPU + * state - all we need to do is to conditionally restore the register + * state itself. + */ +static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) +{ + if (fpu_switch.preload) { + if (unlikely(restore_fpu_checking(new_fpu))) + fpu_reset_state(new_fpu); + } +} + +/* + * Signal frame handlers... + */ +extern int save_xstate_sig(void __user *buf, void __user *fx, int size); +extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size); + +static inline int xstate_sigframe_size(void) +{ + return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; +} + +static inline int restore_xstate_sig(void __user *buf, int ia32_frame) +{ + void __user *buf_fx = buf; + int size = xstate_sigframe_size(); + + if (ia32_frame && use_fxsr()) { + buf_fx = buf + sizeof(struct i387_fsave_struct); + size += sizeof(struct i387_fsave_struct); + } + + return __restore_xstate_sig(buf, buf_fx, size); +} + +/* + * Needs to be preemption-safe. + * + * NOTE! user_fpu_begin() must be used only immediately before restoring + * the save state. It does not do any saving/restoring on its own. In + * lazy FPU mode, it is just an optimization to avoid a #NM exception, + * the task can lose the FPU right after preempt_enable(). + */ +static inline void user_fpu_begin(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + preempt_disable(); + if (!user_has_fpu()) + __thread_fpu_begin(fpu); + preempt_enable(); +} + +/* + * i387 state interaction + */ +static inline unsigned short get_fpu_cwd(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + return tsk->thread.fpu.state->fxsave.cwd; + } else { + return (unsigned short)tsk->thread.fpu.state->fsave.cwd; + } +} + +static inline unsigned short get_fpu_swd(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + return tsk->thread.fpu.state->fxsave.swd; + } else { + return (unsigned short)tsk->thread.fpu.state->fsave.swd; + } +} + +static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) +{ + if (cpu_has_xmm) { + return tsk->thread.fpu.state->fxsave.mxcsr; + } else { + return MXCSR_DEFAULT; + } +} + +extern void fpstate_cache_init(void); + +extern int fpstate_alloc(struct fpu *fpu); +extern void fpstate_free(struct fpu *fpu); +extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); + +static inline unsigned long +alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, + unsigned long *size) +{ + unsigned long frame_size = xstate_sigframe_size(); + + *buf_fx = sp = round_down(sp - frame_size, 64); + if (ia32_frame && use_fxsr()) { + frame_size += sizeof(struct i387_fsave_struct); + sp -= sizeof(struct i387_fsave_struct); + } + + *size = frame_size; + return sp; +} + +#endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 88bb7a75f5c6..8f6a4ea39657 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 176f69b24358..30016f03f25d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -5,7 +5,7 @@ * General FPU state handling cleanups * Gareth Hughes , May 2000 */ -#include +#include /* * Track whether the kernel is using the FPU state diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 4eabb426e910..33df056b1624 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -1,7 +1,7 @@ /* * x86 FPU boot time init code */ -#include +#include #include /* diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 8aa3b864a2e0..4ff726e4e29b 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 51ad3422e728..71f4b4d2f1fd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 7adc314b5075..deff651835b4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_MATH_EMULATION #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4504569c6c4e..c50e013b57d2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 69451b8965f7..c14a00f54b61 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index bcb853e44d30..c67f96c87938 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 60e331ceb844..29f105f0d9fb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,7 +68,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8abcd6a6f3dc..a65586edbb57 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -54,7 +54,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 479d4ce25081..91d7f3b1e50c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -60,7 +60,7 @@ #include #include #include -#include /* Ugh! */ +#include /* Ugh! */ #include #include #include diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 412b5f81e547..5563be313fd6 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include static const char *mpx_mapping_name(struct vm_area_struct *vma) { diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 757678fb26e1..edaf934c749e 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -21,7 +21,7 @@ #include #include #include -#include /* pcntxt_mask */ +#include /* pcntxt_mask */ #include #ifdef CONFIG_X86_32 -- cgit v1.2.1 From df6397525cbe096d0eab0c1530fd25429d26f11b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 03:06:56 +0200 Subject: x86/fpu: Move MXCSR_DEFAULT to fpu/internal.h fpu/types.h gets included everywhere, move the MXCSR_DEFAULT to fpu/internal.h, the place where it's used. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 ++ arch/x86/include/asm/fpu/types.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 386a8837c358..0e9a7a37801a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -33,6 +33,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, # define ia32_setup_rt_frame __setup_rt_frame #endif +#define MXCSR_DEFAULT 0x1f80 + extern unsigned int mxcsr_feature_mask; extern void fpu__cpu_init(void); extern void eager_fpu_init(void); diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index cad1c37d9ea2..917d2e56426a 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -4,8 +4,6 @@ #ifndef _ASM_X86_FPU_H #define _ASM_X86_FPU_H -#define MXCSR_DEFAULT 0x1f80 - struct i387_fsave_struct { u32 cwd; /* FPU Control Word */ u32 swd; /* FPU Status Word */ -- cgit v1.2.1 From c0841e34fd9bbcef58c537151de8c83d2c910099 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 03:18:28 +0200 Subject: x86/fpu: Remove xsave_init() __init obfuscation So this code surprised me - and being surprised when reading FPU code does not help maintainability of an already overly complex subsystem. Remove the obfuscation and just don't use __init annotation for now. Anyone who wants to free these ~600 bytes of xstate_enable_boot_cpu() should implement it cleanly. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 4ff726e4e29b..6c1cbb2487fe 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -606,8 +606,11 @@ static void __init init_xstate_size(void) /* * Enable and initialize the xsave feature. + * + * ( Not marked __init because of false positive section warnings + * generated by xsave_init(). ) */ -static void __init xstate_enable_boot_cpu(void) +static void /* __init */ xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; @@ -663,21 +666,20 @@ static void __init xstate_enable_boot_cpu(void) /* * For the very first instance, this calls xstate_enable_boot_cpu(); * for all subsequent instances, this calls xstate_enable(). - * - * This is somewhat obfuscated due to the lack of powerful enough - * overrides for the section checks. */ void xsave_init(void) { - static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; - void (*this_func)(void); + static char on_boot_cpu = 1; if (!cpu_has_xsave) return; - this_func = next_func; - next_func = xstate_enable; - this_func(); + if (on_boot_cpu) { + on_boot_cpu = 0; + xstate_enable_boot_cpu(); + } else { + xstate_enable(); + } } /* -- cgit v1.2.1 From 7b302e6731ac37fc688bc3085c11f5a886c90c08 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 02:39:22 +0200 Subject: x86/fpu: Remove assembly guard from asm/fpu/api.h asm/fpu/api.h does not contain any defines useful to assembly code, and no assembly code includes asm/fpu/api.h. Remove the historic #ifndef __ASSEMBLY__ leftover guard. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/api.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 9d3a6f3cfc1b..f1eddcccba16 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -10,8 +10,6 @@ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H -#ifndef __ASSEMBLY__ - #include #include @@ -102,6 +100,4 @@ static inline int user_has_fpu(void) extern void fpu__save(struct fpu *fpu); -#endif /* __ASSEMBLY__ */ - #endif /* _ASM_X86_FPU_API_H */ -- cgit v1.2.1 From 32d4d9ccb0f3209425d6e8aefa484bf00a5dc183 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 03:25:18 +0200 Subject: x86/fpu: Improve FPU detection kernel messages Standardize the various boot time messages printed during FPU detection: - Use a common 'x86/fpu: ' prefix for consistency and to make it easy to grep boot logs for FPU related messages - Correct speling errors - Add printout for the legacy FPU case as well - Clarify messages Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 6c1cbb2487fe..bfe92f73bf86 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -3,9 +3,6 @@ * * Author: Suresh Siddha */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include @@ -615,7 +612,7 @@ static void /* __init */ xstate_enable_boot_cpu(void) unsigned int eax, ebx, ecx, edx; if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { - WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); + WARN(1, "x86/fpu: XSTATE_CPUID missing!\n"); return; } @@ -623,8 +620,7 @@ static void /* __init */ xstate_enable_boot_cpu(void) pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { - pr_err("FP/SSE not shown under xsave features 0x%llx\n", - pcntxt_mask); + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", pcntxt_mask); BUG(); } @@ -650,17 +646,18 @@ static void /* __init */ xstate_enable_boot_cpu(void) if (pcntxt_mask & XSTATE_EAGER) { if (eagerfpu == DISABLE) { - pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n", - pcntxt_mask & XSTATE_EAGER); + pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", + pcntxt_mask & XSTATE_EAGER); pcntxt_mask &= ~XSTATE_EAGER; } else { eagerfpu = ENABLE; } } - pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n", - pcntxt_mask, xstate_size, - cpu_has_xsaves ? "compacted form" : "standard form"); + pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n", + pcntxt_mask, + xstate_size, + cpu_has_xsaves ? "compacted" : "standard"); } /* @@ -671,8 +668,13 @@ void xsave_init(void) { static char on_boot_cpu = 1; - if (!cpu_has_xsave) + if (!cpu_has_xsave) { + if (on_boot_cpu) { + on_boot_cpu = 0; + pr_info("x86/fpu: Legacy x87 FPU detected.\n"); + } return; + } if (on_boot_cpu) { on_boot_cpu = 0; -- cgit v1.2.1 From 69496e10f87cec182a9263046c1b251235b8c38e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 08:48:01 +0200 Subject: x86/fpu: Print supported xstate features in human readable way Inform the user/admin about which xstate features the kernel supports. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index bfe92f73bf86..f39882b7281b 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -482,6 +482,30 @@ static void __init setup_xstate_features(void) } while (1); } +static void print_xstate_feature(u64 xstate_mask, const char *desc) +{ + if (pcntxt_mask & xstate_mask) { + int xstate_feature = fls64(xstate_mask)-1; + + pr_info("x86/fpu: Supporting XSAVE feature %2d: '%s'\n", xstate_feature, desc); + } +} + +/* + * Print out all the supported xstate features: + */ +static void print_xstate_features(void) +{ + print_xstate_feature(XSTATE_FP, "x87 floating point registers"); + print_xstate_feature(XSTATE_SSE, "SSE registers"); + print_xstate_feature(XSTATE_YMM, "AVX registers"); + print_xstate_feature(XSTATE_BNDREGS, "MPX bounds registers"); + print_xstate_feature(XSTATE_BNDCSR, "MPX CSR"); + print_xstate_feature(XSTATE_OPMASK, "AVX-512 opmask"); + print_xstate_feature(XSTATE_ZMM_Hi256, "AVX-512 Hi256"); + print_xstate_feature(XSTATE_Hi16_ZMM, "AVX-512 ZMM_Hi256"); +} + /* * This function sets up offsets and sizes of all extended states in * xsave area. This supports both standard format and compacted format @@ -545,6 +569,7 @@ static void __init setup_init_fpu_buf(void) return; setup_xstate_features(); + print_xstate_features(); if (cpu_has_xsaves) { init_xstate_buf->xsave_hdr.xcomp_bv = -- cgit v1.2.1 From 614df7fb8a661b0881f9709206350b59de3f84ab Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 09:20:33 +0200 Subject: x86/fpu: Rename 'pcntxt_mask' to 'xfeatures_mask' So the 'pcntxt_mask' is a misnomer, it's essentially meaningless to anyone who doesn't know what it does exactly. Name it more descriptively as 'xfeatures_mask'. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xsave.h | 2 +- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/xsave.c | 58 ++++++++++++++++++++-------------------- arch/x86/power/cpu.c | 4 +-- 4 files changed, 33 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xsave.h b/arch/x86/include/asm/fpu/xsave.h index 7c90ea93c54e..400d5b2e42eb 100644 --- a/arch/x86/include/asm/fpu/xsave.h +++ b/arch/x86/include/asm/fpu/xsave.h @@ -45,7 +45,7 @@ #endif extern unsigned int xstate_size; -extern u64 pcntxt_mask; +extern u64 xfeatures_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern struct xsave_struct *init_xstate_buf; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 30016f03f25d..8ea9ce090267 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -528,7 +528,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, * mxcsr reserved bits must be masked to zero for security reasons. */ xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->xsave_hdr.xstate_bv &= pcntxt_mask; + xsave->xsave_hdr.xstate_bv &= xfeatures_mask; /* * These bits must be zero. */ diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index f39882b7281b..c0e95538d689 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -13,9 +13,9 @@ #include /* - * Supported feature mask by the CPU and the kernel. + * Mask of xstate features supported by the CPU and the kernel: */ -u64 pcntxt_mask; +u64 xfeatures_mask; /* * Represents init state for the supported extended state. @@ -24,7 +24,7 @@ struct xsave_struct *init_xstate_buf; static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; static unsigned int *xstate_offsets, *xstate_sizes; -static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8]; +static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; static unsigned int xstate_features; /* @@ -52,7 +52,7 @@ void __sanitize_i387_state(struct task_struct *tsk) * None of the feature bits are in init state. So nothing else * to do for us, as the memory layout is up to date. */ - if ((xstate_bv & pcntxt_mask) == pcntxt_mask) + if ((xstate_bv & xfeatures_mask) == xfeatures_mask) return; /* @@ -74,7 +74,7 @@ void __sanitize_i387_state(struct task_struct *tsk) if (!(xstate_bv & XSTATE_SSE)) memset(&fx->xmm_space[0], 0, 256); - xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; + xstate_bv = (xfeatures_mask & ~xstate_bv) >> 2; /* * Update all the other memory layouts for which the corresponding @@ -291,7 +291,7 @@ sanitize_restored_xstate(struct task_struct *tsk, if (fx_only) xsave_hdr->xstate_bv = XSTATE_FPSSE; else - xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv); + xsave_hdr->xstate_bv &= (xfeatures_mask & xstate_bv); } if (use_fxsr()) { @@ -312,11 +312,11 @@ static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) { if (use_xsave()) { if ((unsigned long)buf % 64 || fx_only) { - u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE; + u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE; xrstor_state(init_xstate_buf, init_bv); return fxrstor_user(buf); } else { - u64 init_bv = pcntxt_mask & ~xbv; + u64 init_bv = xfeatures_mask & ~xbv; if (unlikely(init_bv)) xrstor_state(init_xstate_buf, init_bv); return xrestore_user(buf, xbv); @@ -439,7 +439,7 @@ static void prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; - fx_sw_reserved.xstate_bv = pcntxt_mask; + fx_sw_reserved.xstate_bv = xfeatures_mask; fx_sw_reserved.xstate_size = xstate_size; if (config_enabled(CONFIG_IA32_EMULATION)) { @@ -454,7 +454,7 @@ static void prepare_fx_sw_frame(void) static inline void xstate_enable(void) { cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); } /* @@ -465,7 +465,7 @@ static void __init setup_xstate_features(void) { int eax, ebx, ecx, edx, leaf = 0x2; - xstate_features = fls64(pcntxt_mask); + xstate_features = fls64(xfeatures_mask); xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); @@ -484,7 +484,7 @@ static void __init setup_xstate_features(void) static void print_xstate_feature(u64 xstate_mask, const char *desc) { - if (pcntxt_mask & xstate_mask) { + if (xfeatures_mask & xstate_mask) { int xstate_feature = fls64(xstate_mask)-1; pr_info("x86/fpu: Supporting XSAVE feature %2d: '%s'\n", xstate_feature, desc); @@ -516,7 +516,7 @@ static void print_xstate_features(void) */ void setup_xstate_comp(void) { - unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8]; + unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; int i; /* @@ -529,7 +529,7 @@ void setup_xstate_comp(void) if (!cpu_has_xsaves) { for (i = 2; i < xstate_features; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) { + if (test_bit(i, (unsigned long *)&xfeatures_mask)) { xstate_comp_offsets[i] = xstate_offsets[i]; xstate_comp_sizes[i] = xstate_sizes[i]; } @@ -540,7 +540,7 @@ void setup_xstate_comp(void) xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; for (i = 2; i < xstate_features; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) + if (test_bit(i, (unsigned long *)&xfeatures_mask)) xstate_comp_sizes[i] = xstate_sizes[i]; else xstate_comp_sizes[i] = 0; @@ -573,8 +573,8 @@ static void __init setup_init_fpu_buf(void) if (cpu_has_xsaves) { init_xstate_buf->xsave_hdr.xcomp_bv = - (u64)1 << 63 | pcntxt_mask; - init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask; + (u64)1 << 63 | xfeatures_mask; + init_xstate_buf->xsave_hdr.xstate_bv = xfeatures_mask; } /* @@ -604,7 +604,7 @@ __setup("eagerfpu=", eager_fpu_setup); /* - * Calculate total size of enabled xstates in XCR0/pcntxt_mask. + * Calculate total size of enabled xstates in XCR0/xfeatures_mask. */ static void __init init_xstate_size(void) { @@ -619,7 +619,7 @@ static void __init init_xstate_size(void) xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; for (i = 2; i < 64; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) { + if (test_bit(i, (unsigned long *)&xfeatures_mask)) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_size += eax; } @@ -642,17 +642,17 @@ static void /* __init */ xstate_enable_boot_cpu(void) } cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - pcntxt_mask = eax + ((u64)edx << 32); + xfeatures_mask = eax + ((u64)edx << 32); - if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { - pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", pcntxt_mask); + if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); BUG(); } /* * Support only the state known to OS. */ - pcntxt_mask = pcntxt_mask & XCNTXT_MASK; + xfeatures_mask = xfeatures_mask & XCNTXT_MASK; xstate_enable(); @@ -661,7 +661,7 @@ static void /* __init */ xstate_enable_boot_cpu(void) */ init_xstate_size(); - update_regset_xstate_info(xstate_size, pcntxt_mask); + update_regset_xstate_info(xstate_size, xfeatures_mask); prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -669,18 +669,18 @@ static void /* __init */ xstate_enable_boot_cpu(void) if (cpu_has_xsaveopt && eagerfpu != DISABLE) eagerfpu = ENABLE; - if (pcntxt_mask & XSTATE_EAGER) { + if (xfeatures_mask & XSTATE_EAGER) { if (eagerfpu == DISABLE) { pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", - pcntxt_mask & XSTATE_EAGER); - pcntxt_mask &= ~XSTATE_EAGER; + xfeatures_mask & XSTATE_EAGER); + xfeatures_mask &= ~XSTATE_EAGER; } else { eagerfpu = ENABLE; } } pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n", - pcntxt_mask, + xfeatures_mask, xstate_size, cpu_has_xsaves ? "compacted" : "standard"); } @@ -749,7 +749,7 @@ void __init_refok eager_fpu_init(void) void *get_xsave_addr(struct xsave_struct *xsave, int xstate) { int feature = fls64(xstate) - 1; - if (!test_bit(feature, (unsigned long *)&pcntxt_mask)) + if (!test_bit(feature, (unsigned long *)&xfeatures_mask)) return NULL; return (void *)xsave + xstate_comp_offsets[feature]; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index edaf934c749e..62054acbd0d8 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -21,7 +21,7 @@ #include #include #include -#include /* pcntxt_mask */ +#include /* xfeatures_mask */ #include #ifdef CONFIG_X86_32 @@ -225,7 +225,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) * restore XCR0 for xsave capable cpu's. */ if (cpu_has_xsave) - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); fix_processor_context(); -- cgit v1.2.1 From 84246fe4e3a0e412a9602983ba37f4e4dbebb3e8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 09:23:59 +0200 Subject: x86/fpu: Rename 'xstate_features' to 'xfeatures_nr' The name 'xstate_features' does not tell us whether it's a bitmap or any other value. That it's a count of features is only obvious if you read the code that calculates it. Rename it to the more descriptive 'xfeatures_nr' name. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index c0e95538d689..1d0e27128f18 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -25,7 +25,9 @@ struct xsave_struct *init_xstate_buf; static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; static unsigned int *xstate_offsets, *xstate_sizes; static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; -static unsigned int xstate_features; + +/* The number of supported xfeatures in xfeatures_mask: */ +static unsigned int xfeatures_nr; /* * If a processor implementation discern that a processor state component is @@ -465,9 +467,9 @@ static void __init setup_xstate_features(void) { int eax, ebx, ecx, edx, leaf = 0x2; - xstate_features = fls64(xfeatures_mask); - xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); - xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); + xfeatures_nr = fls64(xfeatures_mask); + xstate_offsets = alloc_bootmem(xfeatures_nr * sizeof(int)); + xstate_sizes = alloc_bootmem(xfeatures_nr * sizeof(int)); do { cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); @@ -528,7 +530,7 @@ void setup_xstate_comp(void) xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space); if (!cpu_has_xsaves) { - for (i = 2; i < xstate_features; i++) { + for (i = 2; i < xfeatures_nr; i++) { if (test_bit(i, (unsigned long *)&xfeatures_mask)) { xstate_comp_offsets[i] = xstate_offsets[i]; xstate_comp_sizes[i] = xstate_sizes[i]; @@ -539,7 +541,7 @@ void setup_xstate_comp(void) xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = 2; i < xstate_features; i++) { + for (i = 2; i < xfeatures_nr; i++) { if (test_bit(i, (unsigned long *)&xfeatures_mask)) xstate_comp_sizes[i] = xstate_sizes[i]; else -- cgit v1.2.1 From 9254aaa0fea4e8aa4e83539c379aecb41a975ca6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 10:02:32 +0200 Subject: x86/fpu: Move XCR0 manipulation to the FPU code proper The suspend code accesses FPU state internals, add a helper for it and isolate it. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/api.h | 1 + arch/x86/kernel/fpu/xsave.c | 12 ++++++++++++ arch/x86/power/cpu.c | 10 ++-------- 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index f1eddcccba16..5bdde8ca87bc 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -23,6 +23,7 @@ extern void fpu__clear(struct task_struct *tsk); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern void fpu__restore(void); extern void fpu__init_check_bugs(void); +extern void fpu__resume_cpu(void); extern bool irq_fpu_usable(void); diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 1d0e27128f18..a485180ebc32 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -734,6 +734,18 @@ void __init_refok eager_fpu_init(void) setup_init_fpu_buf(); } +/* + * Restore minimal FPU state after suspend: + */ +void fpu__resume_cpu(void) +{ + /* + * Restore XCR0 on xsave capable CPUs: + */ + if (cpu_has_xsave) + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); +} + /* * Given the xsave area and a state inside, this function returns the * address of the state. diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 62054acbd0d8..ad0ce6b70fac 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -18,10 +18,8 @@ #include #include #include -#include #include #include -#include /* xfeatures_mask */ #include #ifdef CONFIG_X86_32 @@ -155,6 +153,8 @@ static void fix_processor_context(void) #endif load_TR_desc(); /* This does ltr */ load_LDT(¤t->active_mm->context); /* This does lldt */ + + fpu__resume_cpu(); } /** @@ -221,12 +221,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); #endif - /* - * restore XCR0 for xsave capable cpu's. - */ - if (cpu_has_xsave) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); - fix_processor_context(); do_fpu_end(); -- cgit v1.2.1 From 8dcea8db793150ba7d56d56f0a397260db490abe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 10:11:24 +0200 Subject: x86/fpu: Clean up regset functions Clean up various regset handlers: use the 'fpu' pointer which is available in most cases. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8ea9ce090267..07c8a4489e0c 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -438,7 +438,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, sanitize_i387_state(target); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fxsave, 0, -1); + &fpu->state->fxsave, 0, -1); } int xfpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -458,19 +458,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, sanitize_i387_state(target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fxsave, 0, -1); + &fpu->state->fxsave, 0, -1); /* * mxcsr reserved bits must be masked to zero for security reasons. */ - target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; + fpu->state->fxsave.mxcsr &= mxcsr_feature_mask; /* * update the header bits in the xsave header, indicating the * presence of FP and SSE state. */ if (cpu_has_xsave) - target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + fpu->state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; return ret; } @@ -490,7 +490,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - xsave = &target->thread.fpu.state->xsave; + xsave = &fpu->state->xsave; /* * Copy the 48bytes defined by the software first into the xstate @@ -521,7 +521,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - xsave = &target->thread.fpu.state->xsave; + xsave = &fpu->state->xsave; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); /* @@ -533,6 +533,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, * These bits must be zero. */ memset(&xsave->xsave_hdr.reserved, 0, 48); + return ret; } @@ -690,7 +691,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fsave, 0, + &fpu->state->fsave, 0, -1); sanitize_i387_state(target); @@ -724,7 +725,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fsave, 0, + &fpu->state->fsave, 0, -1); if (pos > 0 || count < sizeof(env)) @@ -739,7 +740,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * presence of FP. */ if (cpu_has_xsave) - target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; + fpu->state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; return ret; } -- cgit v1.2.1 From 3a54450b5ed1671a6adecf501a0b4d4c1d27235d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 10:14:36 +0200 Subject: x86/fpu: Rename 'xsave_hdr' to 'header' Code like: fpu->state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; is an eyesore, because not only is the words 'xsave' and 'state' are repeated twice times (!), but also because of the 'hdr' and 'bv' abbreviations that are pretty meaningless at a first glance. Start cleaning this up by renaming 'xsave_hdr' to 'header'. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/include/asm/fpu/types.h | 4 ++-- arch/x86/include/asm/fpu/xsave.h | 2 +- arch/x86/include/asm/user.h | 10 +++++----- arch/x86/include/uapi/asm/sigcontext.h | 4 ++-- arch/x86/kernel/fpu/core.c | 8 ++++---- arch/x86/kernel/fpu/xsave.c | 22 +++++++++++----------- arch/x86/kvm/x86.c | 8 ++++---- 8 files changed, 30 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0e9a7a37801a..3007df99833e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -261,7 +261,7 @@ static inline int fpu_save_init(struct fpu *fpu) /* * xsave header may indicate the init state of the FP. */ - if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) + if (!(fpu->state->xsave.header.xstate_bv & XSTATE_FP)) return 1; } else if (use_fxsr()) { fpu_fxsave(fpu); diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 917d2e56426a..33c0c7b782db 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -99,7 +99,7 @@ struct bndcsr { u64 bndstatus; } __packed; -struct xsave_hdr_struct { +struct xstate_header { u64 xstate_bv; u64 xcomp_bv; u64 reserved[6]; @@ -107,7 +107,7 @@ struct xsave_hdr_struct { struct xsave_struct { struct i387_fxsave_struct i387; - struct xsave_hdr_struct xsave_hdr; + struct xstate_header header; struct ymmh_struct ymmh; struct lwp_struct lwp; struct bndreg bndreg[4]; diff --git a/arch/x86/include/asm/fpu/xsave.h b/arch/x86/include/asm/fpu/xsave.h index 400d5b2e42eb..b27b4466f88d 100644 --- a/arch/x86/include/asm/fpu/xsave.h +++ b/arch/x86/include/asm/fpu/xsave.h @@ -212,7 +212,7 @@ static inline int xsave_user(struct xsave_struct __user *buf) * Clear the xsave header first, so that reserved fields are * initialized to zero. */ - err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr)); + err = __clear_user(&buf->header, sizeof(buf->header)); if (unlikely(err)) return -EFAULT; diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h index ccab4af1646d..fa042410c42c 100644 --- a/arch/x86/include/asm/user.h +++ b/arch/x86/include/asm/user.h @@ -14,7 +14,7 @@ struct user_ymmh_regs { __u32 ymmh_space[64]; }; -struct user_xsave_hdr { +struct user_xstate_header { __u64 xstate_bv; __u64 reserved1[2]; __u64 reserved2[5]; @@ -41,11 +41,11 @@ struct user_xsave_hdr { * particular process/thread. * * Also when the user modifies certain state FP/SSE/etc through the - * ptrace interface, they must ensure that the xsave_hdr.xstate_bv + * ptrace interface, they must ensure that the header.xstate_bv * bytes[512..519] of the memory layout are updated correspondingly. * i.e., for example when FP state is modified to a non-init state, - * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to - * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc. + * header.xstate_bv's bit 0 must be set to '1', when SSE is modified to + * non-init state, header.xstate_bv's bit 1 must to be set to '1', etc. */ #define USER_XSTATE_FX_SW_WORDS 6 #define USER_XSTATE_XCR0_WORD 0 @@ -55,7 +55,7 @@ struct user_xstateregs { __u64 fpx_space[58]; __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS]; } i387; - struct user_xsave_hdr xsave_hdr; + struct user_xstate_header header; struct user_ymmh_regs ymmh; /* further processor state extensions go here */ }; diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 16dc4e8a2cd3..7f850f7b5c45 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -209,7 +209,7 @@ struct sigcontext { #endif /* !__i386__ */ -struct _xsave_hdr { +struct _header { __u64 xstate_bv; __u64 reserved1[2]; __u64 reserved2[5]; @@ -228,7 +228,7 @@ struct _ymmh_state { */ struct _xstate { struct _fpstate fpstate; - struct _xsave_hdr xstate_hdr; + struct _header xstate_hdr; struct _ymmh_state ymmh; /* new processor state extensions go here */ }; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 07c8a4489e0c..74189f31f1a2 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -470,7 +470,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, * presence of FP and SSE state. */ if (cpu_has_xsave) - fpu->state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + fpu->state->xsave.header.xstate_bv |= XSTATE_FPSSE; return ret; } @@ -528,11 +528,11 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, * mxcsr reserved bits must be masked to zero for security reasons. */ xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->xsave_hdr.xstate_bv &= xfeatures_mask; + xsave->header.xstate_bv &= xfeatures_mask; /* * These bits must be zero. */ - memset(&xsave->xsave_hdr.reserved, 0, 48); + memset(&xsave->header.reserved, 0, 48); return ret; } @@ -740,7 +740,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * presence of FP. */ if (cpu_has_xsave) - fpu->state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; + fpu->state->xsave.header.xstate_bv |= XSTATE_FP; return ret; } diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index a485180ebc32..03639fa079b0 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -32,7 +32,7 @@ static unsigned int xfeatures_nr; /* * If a processor implementation discern that a processor state component is * in its initialized state it may modify the corresponding bit in the - * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory + * header.xstate_bv as '0', with out modifying the corresponding memory * layout in the case of xsaveopt. While presenting the xstate information to * the user, we always ensure that the memory layout of a feature will be in * the init state if the corresponding header bit is zero. This is to ensure @@ -48,7 +48,7 @@ void __sanitize_i387_state(struct task_struct *tsk) if (!fx) return; - xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; + xstate_bv = tsk->thread.fpu.state->xsave.header.xstate_bv; /* * None of the feature bits are in init state. So nothing else @@ -106,7 +106,7 @@ static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct i387_fxsave_struct) + - sizeof(struct xsave_hdr_struct); + sizeof(struct xstate_header); unsigned int magic2; if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) @@ -178,7 +178,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) * Read the xstate_bv which we copied (directly from the cpu or * from the state in task struct) to the user buffers. */ - err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); + err |= __get_user(xstate_bv, (__u32 *)&x->header.xstate_bv); /* * For legacy compatible, we always set FP/SSE bits in the bit @@ -193,7 +193,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) */ xstate_bv |= XSTATE_FPSSE; - err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); + err |= __put_user(xstate_bv, (__u32 *)&x->header.xstate_bv); return err; } @@ -280,20 +280,20 @@ sanitize_restored_xstate(struct task_struct *tsk, u64 xstate_bv, int fx_only) { struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; - struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr; + struct xstate_header *header = &xsave->header; if (use_xsave()) { /* These bits must be zero. */ - memset(xsave_hdr->reserved, 0, 48); + memset(header->reserved, 0, 48); /* * Init the state that is not present in the memory * layout and not enabled by the OS. */ if (fx_only) - xsave_hdr->xstate_bv = XSTATE_FPSSE; + header->xstate_bv = XSTATE_FPSSE; else - xsave_hdr->xstate_bv &= (xfeatures_mask & xstate_bv); + header->xstate_bv &= (xfeatures_mask & xstate_bv); } if (use_fxsr()) { @@ -574,9 +574,9 @@ static void __init setup_init_fpu_buf(void) print_xstate_features(); if (cpu_has_xsaves) { - init_xstate_buf->xsave_hdr.xcomp_bv = + init_xstate_buf->header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; - init_xstate_buf->xsave_hdr.xstate_bv = xfeatures_mask; + init_xstate_buf->header.xstate_bv = xfeatures_mask; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 91d7f3b1e50c..ac24889c8bc3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3197,7 +3197,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; - u64 xstate_bv = xsave->xsave_hdr.xstate_bv; + u64 xstate_bv = xsave->header.xstate_bv; u64 valid; /* @@ -3243,9 +3243,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) memcpy(xsave, src, XSAVE_HDR_OFFSET); /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->xsave_hdr.xstate_bv = xstate_bv; + xsave->header.xstate_bv = xstate_bv; if (cpu_has_xsaves) - xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; + xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* * Copy each region from the non-compacted offset to the @@ -7014,7 +7014,7 @@ int fx_init(struct kvm_vcpu *vcpu) fpstate_init(&vcpu->arch.guest_fpu); if (cpu_has_xsaves) - vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv = + vcpu->arch.guest_fpu.state->xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* -- cgit v1.2.1 From 400e4b209166dcd3e3a155401c57bdc6413bf715 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 10:19:47 +0200 Subject: x86/fpu: Rename xsave.header::xstate_bv to 'xfeatures' 'xsave.header::xstate_bv' is a misnomer - what does 'bv' stand for? It probably comes from the 'XGETBV' instruction name, but I could not find in the Intel documentation where that abbreviation comes from. It could mean 'bit vector' - or something else? But how about - instead of guessing about a weird name - we named the field in an obvious and descriptive way that tells us exactly what it does? So rename it to 'xfeatures', which is a bitmask of the xfeatures that are fpstate_active in that context structure. Eyesore like: fpu->state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; is now much more readable: fpu->state->xsave.header.xfeatures |= XSTATE_FP; Which form is not just infinitely more readable, but is also shorter as well. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/include/asm/fpu/types.h | 2 +- arch/x86/include/asm/user.h | 8 +++--- arch/x86/include/uapi/asm/sigcontext.h | 4 +-- arch/x86/kernel/fpu/core.c | 6 ++-- arch/x86/kernel/fpu/xsave.c | 52 +++++++++++++++++----------------- arch/x86/kvm/x86.c | 4 +-- 7 files changed, 39 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3007df99833e..07c6adc02f68 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -261,7 +261,7 @@ static inline int fpu_save_init(struct fpu *fpu) /* * xsave header may indicate the init state of the FP. */ - if (!(fpu->state->xsave.header.xstate_bv & XSTATE_FP)) + if (!(fpu->state->xsave.header.xfeatures & XSTATE_FP)) return 1; } else if (use_fxsr()) { fpu_fxsave(fpu); diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 33c0c7b782db..9bd2cd1a19fd 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -100,7 +100,7 @@ struct bndcsr { } __packed; struct xstate_header { - u64 xstate_bv; + u64 xfeatures; u64 xcomp_bv; u64 reserved[6]; } __attribute__((packed)); diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h index fa042410c42c..59a54e869f15 100644 --- a/arch/x86/include/asm/user.h +++ b/arch/x86/include/asm/user.h @@ -15,7 +15,7 @@ struct user_ymmh_regs { }; struct user_xstate_header { - __u64 xstate_bv; + __u64 xfeatures; __u64 reserved1[2]; __u64 reserved2[5]; }; @@ -41,11 +41,11 @@ struct user_xstate_header { * particular process/thread. * * Also when the user modifies certain state FP/SSE/etc through the - * ptrace interface, they must ensure that the header.xstate_bv + * ptrace interface, they must ensure that the header.xfeatures * bytes[512..519] of the memory layout are updated correspondingly. * i.e., for example when FP state is modified to a non-init state, - * header.xstate_bv's bit 0 must be set to '1', when SSE is modified to - * non-init state, header.xstate_bv's bit 1 must to be set to '1', etc. + * header.xfeatures's bit 0 must be set to '1', when SSE is modified to + * non-init state, header.xfeatures's bit 1 must to be set to '1', etc. */ #define USER_XSTATE_FX_SW_WORDS 6 #define USER_XSTATE_XCR0_WORD 0 diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 7f850f7b5c45..0e8a973de9ee 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -25,7 +25,7 @@ struct _fpx_sw_bytes { __u32 extended_size; /* total size of the layout referred by * fpstate pointer in the sigcontext. */ - __u64 xstate_bv; + __u64 xfeatures; /* feature bit mask (including fp/sse/extended * state) that is present in the memory * layout. @@ -210,7 +210,7 @@ struct sigcontext { #endif /* !__i386__ */ struct _header { - __u64 xstate_bv; + __u64 xfeatures; __u64 reserved1[2]; __u64 reserved2[5]; }; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 74189f31f1a2..ae8f26b6b0e5 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -470,7 +470,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, * presence of FP and SSE state. */ if (cpu_has_xsave) - fpu->state->xsave.header.xstate_bv |= XSTATE_FPSSE; + fpu->state->xsave.header.xfeatures |= XSTATE_FPSSE; return ret; } @@ -528,7 +528,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, * mxcsr reserved bits must be masked to zero for security reasons. */ xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->header.xstate_bv &= xfeatures_mask; + xsave->header.xfeatures &= xfeatures_mask; /* * These bits must be zero. */ @@ -740,7 +740,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * presence of FP. */ if (cpu_has_xsave) - fpu->state->xsave.header.xstate_bv |= XSTATE_FP; + fpu->state->xsave.header.xfeatures |= XSTATE_FP; return ret; } diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 03639fa079b0..467e4635bd29 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -32,7 +32,7 @@ static unsigned int xfeatures_nr; /* * If a processor implementation discern that a processor state component is * in its initialized state it may modify the corresponding bit in the - * header.xstate_bv as '0', with out modifying the corresponding memory + * header.xfeatures as '0', with out modifying the corresponding memory * layout in the case of xsaveopt. While presenting the xstate information to * the user, we always ensure that the memory layout of a feature will be in * the init state if the corresponding header bit is zero. This is to ensure @@ -43,24 +43,24 @@ void __sanitize_i387_state(struct task_struct *tsk) { struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; int feature_bit = 0x2; - u64 xstate_bv; + u64 xfeatures; if (!fx) return; - xstate_bv = tsk->thread.fpu.state->xsave.header.xstate_bv; + xfeatures = tsk->thread.fpu.state->xsave.header.xfeatures; /* * None of the feature bits are in init state. So nothing else * to do for us, as the memory layout is up to date. */ - if ((xstate_bv & xfeatures_mask) == xfeatures_mask) + if ((xfeatures & xfeatures_mask) == xfeatures_mask) return; /* * FP is in init state */ - if (!(xstate_bv & XSTATE_FP)) { + if (!(xfeatures & XSTATE_FP)) { fx->cwd = 0x37f; fx->swd = 0; fx->twd = 0; @@ -73,17 +73,17 @@ void __sanitize_i387_state(struct task_struct *tsk) /* * SSE is in init state */ - if (!(xstate_bv & XSTATE_SSE)) + if (!(xfeatures & XSTATE_SSE)) memset(&fx->xmm_space[0], 0, 256); - xstate_bv = (xfeatures_mask & ~xstate_bv) >> 2; + xfeatures = (xfeatures_mask & ~xfeatures) >> 2; /* * Update all the other memory layouts for which the corresponding * header bit is in the init state. */ - while (xstate_bv) { - if (xstate_bv & 0x1) { + while (xfeatures) { + if (xfeatures & 0x1) { int offset = xstate_offsets[feature_bit]; int size = xstate_sizes[feature_bit]; @@ -92,7 +92,7 @@ void __sanitize_i387_state(struct task_struct *tsk) size); } - xstate_bv >>= 1; + xfeatures >>= 1; feature_bit++; } } @@ -162,7 +162,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) { struct xsave_struct __user *x = buf; struct _fpx_sw_bytes *sw_bytes; - u32 xstate_bv; + u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ @@ -175,25 +175,25 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); /* - * Read the xstate_bv which we copied (directly from the cpu or + * Read the xfeatures which we copied (directly from the cpu or * from the state in task struct) to the user buffers. */ - err |= __get_user(xstate_bv, (__u32 *)&x->header.xstate_bv); + err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures); /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. This will * enable us capturing any changes(during sigreturn) to * the FP/SSE bits by the legacy applications which don't touch - * xstate_bv in the xsave header. + * xfeatures in the xsave header. * - * xsave aware apps can change the xstate_bv in the xsave + * xsave aware apps can change the xfeatures in the xsave * header as well as change any contents in the memory layout. * xrestore as part of sigreturn will capture all the changes. */ - xstate_bv |= XSTATE_FPSSE; + xfeatures |= XSTATE_FPSSE; - err |= __put_user(xstate_bv, (__u32 *)&x->header.xstate_bv); + err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); return err; } @@ -277,7 +277,7 @@ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) static inline void sanitize_restored_xstate(struct task_struct *tsk, struct user_i387_ia32_struct *ia32_env, - u64 xstate_bv, int fx_only) + u64 xfeatures, int fx_only) { struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; struct xstate_header *header = &xsave->header; @@ -291,9 +291,9 @@ sanitize_restored_xstate(struct task_struct *tsk, * layout and not enabled by the OS. */ if (fx_only) - header->xstate_bv = XSTATE_FPSSE; + header->xfeatures = XSTATE_FPSSE; else - header->xstate_bv &= (xfeatures_mask & xstate_bv); + header->xfeatures &= (xfeatures_mask & xfeatures); } if (use_fxsr()) { @@ -335,7 +335,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; int state_size = xstate_size; - u64 xstate_bv = 0; + u64 xfeatures = 0; int fx_only = 0; ia32_fxstate &= (config_enabled(CONFIG_X86_32) || @@ -369,7 +369,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) fx_only = 1; } else { state_size = fx_sw_user.xstate_size; - xstate_bv = fx_sw_user.xstate_bv; + xfeatures = fx_sw_user.xfeatures; } } @@ -398,7 +398,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) fpstate_init(fpu); err = -1; } else { - sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); + sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); } fpu->fpstate_active = 1; @@ -415,7 +415,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) * state to the registers directly (with exceptions handled). */ user_fpu_begin(); - if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { + if (restore_user_xstate(buf_fx, xfeatures, fx_only)) { fpu_reset_state(fpu); return -1; } @@ -441,7 +441,7 @@ static void prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; - fx_sw_reserved.xstate_bv = xfeatures_mask; + fx_sw_reserved.xfeatures = xfeatures_mask; fx_sw_reserved.xstate_size = xstate_size; if (config_enabled(CONFIG_IA32_EMULATION)) { @@ -576,7 +576,7 @@ static void __init setup_init_fpu_buf(void) if (cpu_has_xsaves) { init_xstate_buf->header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; - init_xstate_buf->header.xstate_bv = xfeatures_mask; + init_xstate_buf->header.xfeatures = xfeatures_mask; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ac24889c8bc3..0b58b9397098 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3197,7 +3197,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; - u64 xstate_bv = xsave->header.xstate_bv; + u64 xstate_bv = xsave->header.xfeatures; u64 valid; /* @@ -3243,7 +3243,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) memcpy(xsave, src, XSAVE_HDR_OFFSET); /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->header.xstate_bv = xstate_bv; + xsave->header.xfeatures = xstate_bv; if (cpu_has_xsaves) xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; -- cgit v1.2.1 From 91a8c2a5b43fc4be4adb4bda50cd331697e289e0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 10:49:11 +0200 Subject: x86/fpu: Clean up and fix MXCSR handling The code has the following problems: - it uses a single global 'fx_scratch' area that multiple CPUs could write into simultaneously, in theory. - it wastes 512 bytes of .data for something that is only rarely used. Fix this by moving the state buffer to the stack. Note that while this is 512 bytes, we don't ever call this function in very deep callchains, so its stack usage should not be a problem. Also add comments to explain the magic 0x0000ffbf default value. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 33df056b1624..0b16f61cb2a4 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -68,18 +68,26 @@ void fpu__init_check_bugs(void) * Boot time FPU feature detection code: */ unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; + unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); -static struct i387_fxsave_struct fx_scratch; static void mxcsr_feature_mask_init(void) { - unsigned long mask = 0; + unsigned int mask = 0; if (cpu_has_fxsr) { - memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : "+m" (fx_scratch)); - mask = fx_scratch.mxcsr_mask; + struct i387_fxsave_struct fx_tmp __aligned(32) = { }; + + asm volatile("fxsave %0" : "+m" (fx_tmp)); + + mask = fx_tmp.mxcsr_mask; + + /* + * If zero then use the default features mask, + * which has all features set, except the + * denormals-are-zero feature bit: + */ if (mask == 0) mask = 0x0000ffbf; } -- cgit v1.2.1 From 678eaf603460180260a645de359050fd6568cf74 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 14:48:24 +0200 Subject: x86/fpu: Rename regset FPU register accessors Rename regset accessors to prefix them with 'regset_', because we want to start using the 'fpregs_active' name elsewhere. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 8 ++++---- arch/x86/kernel/fpu/core.c | 6 +++--- arch/x86/kernel/ptrace.c | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 07c6adc02f68..6eea81c068fb 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -46,17 +46,17 @@ extern void convert_from_fxsr(struct user_i387_ia32_struct *env, extern void convert_to_fxsr(struct task_struct *tsk, const struct user_i387_ia32_struct *env); -extern user_regset_active_fn fpregs_active, xfpregs_active; +extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, xstateregs_get; extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, xstateregs_set; /* - * xstateregs_active == fpregs_active. Please refer to the comment - * at the definition of fpregs_active. + * xstateregs_active == regset_fpregs_active. Please refer to the comment + * at the definition of regset_fpregs_active. */ -#define xstateregs_active fpregs_active +#define xstateregs_active regset_fpregs_active #ifdef CONFIG_MATH_EMULATION extern void finit_soft_fpu(struct i387_soft_struct *soft); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ae8f26b6b0e5..4978a77269d6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -403,18 +403,18 @@ void fpu__clear(struct task_struct *tsk) } /* - * The xstateregs_active() routine is the same as the fpregs_active() routine, + * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, * as the "regset->n" for the xstate regset will be updated based on the feature * capabilites supported by the xsave. */ -int fpregs_active(struct task_struct *target, const struct user_regset *regset) +int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { struct fpu *target_fpu = &target->thread.fpu; return target_fpu->fpstate_active ? regset->n : 0; } -int xfpregs_active(struct task_struct *target, const struct user_regset *regset) +int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { struct fpu *target_fpu = &target->thread.fpu; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index c14a00f54b61..4c615661ec72 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1296,7 +1296,7 @@ static struct user_regset x86_64_regsets[] __read_mostly = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, @@ -1337,13 +1337,13 @@ static struct user_regset x86_32_regsets[] __read_mostly = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = fpregs_active, .get = fpregs_get, .set = fpregs_set + .active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set }, [REGSET_XFP] = { .core_note_type = NT_PRXFPREG, .n = sizeof(struct user32_fxsr_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, -- cgit v1.2.1 From e783e8167ddf275782ef448eb139fafff3ac3af2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 11:43:47 +0200 Subject: x86/fpu: Explain the AVX register layout in the xsave area The previous explanation was rather cryptic. Also transform "u32 [64]" to the more readable "u8[256]" form. No change in implementation. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 9bd2cd1a19fd..8a5120a3b48b 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -78,9 +78,16 @@ struct i387_soft_struct { u32 entry_eip; }; +/* + * There are 16x 256-bit AVX registers named YMM0-YMM15. + * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) + * and are stored in 'struct i387_fxsave_struct::xmm_space[]'. + * + * The high 128 bits are stored here: + * 16x 128 bits == 256 bytes. + */ struct ymmh_struct { - /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ - u32 ymmh_space[64]; + u8 ymmh_space[256]; }; /* We don't support LWP yet: */ -- cgit v1.2.1 From 73a3aeb3ac5312122d5a261c7acb0ab9be93857a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 11:32:59 +0200 Subject: x86/fpu: Improve the __sanitize_i387_state() documentation Improve the comments and add new ones, as this code isn't very obvious. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 467e4635bd29..f3d30f0c50f9 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -30,19 +30,23 @@ static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; static unsigned int xfeatures_nr; /* - * If a processor implementation discern that a processor state component is - * in its initialized state it may modify the corresponding bit in the - * header.xfeatures as '0', with out modifying the corresponding memory - * layout in the case of xsaveopt. While presenting the xstate information to - * the user, we always ensure that the memory layout of a feature will be in - * the init state if the corresponding header bit is zero. This is to ensure - * that the user doesn't see some stale state in the memory layout during - * signal handling, debugging etc. + * When executing XSAVEOPT (optimized XSAVE), if a processor implementation + * detects that an FPU state component is still (or is again) in its + * initialized state, it may clear the corresponding bit in the header.xfeatures + * field, and can skip the writeout of registers to the corresponding memory layout. + * + * This means that when the bit is zero, the state component might still contain + * some previous - non-initialized register state. + * + * Before writing xstate information to user-space we sanitize those components, + * to always ensure that the memory layout of a feature will be in the init state + * if the corresponding header bit is zero. This is to ensure that user-space doesn't + * see some stale state in the memory layout during signal handling, debugging etc. */ void __sanitize_i387_state(struct task_struct *tsk) { struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; - int feature_bit = 0x2; + int feature_bit; u64 xfeatures; if (!fx) @@ -76,19 +80,25 @@ void __sanitize_i387_state(struct task_struct *tsk) if (!(xfeatures & XSTATE_SSE)) memset(&fx->xmm_space[0], 0, 256); + /* + * First two features are FPU and SSE, which above we handled + * in a special way already: + */ + feature_bit = 0x2; xfeatures = (xfeatures_mask & ~xfeatures) >> 2; /* - * Update all the other memory layouts for which the corresponding - * header bit is in the init state. + * Update all the remaining memory layouts according to their + * standard xstate layout, if their header bit is in the init + * state: */ while (xfeatures) { if (xfeatures & 0x1) { int offset = xstate_offsets[feature_bit]; int size = xstate_sizes[feature_bit]; - memcpy(((void *) fx) + offset, - ((void *) init_xstate_buf) + offset, + memcpy((void *)fx + offset, + (void *)init_xstate_buf + offset, size); } -- cgit v1.2.1 From d5cea9b0af1509f170337ba8f47160d0699ff374 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 14:19:26 +0200 Subject: x86/fpu: Rename fpu->has_fpu to fpu->fpregs_active So the current code uses fpu->has_cpu to determine whether a given user FPU context is actively loaded into the FPU's registers [*] and that those registers represent the task's current FPU state. But this term is not unambiguous: especially the distinction between fpu->has_fpu, PF_USED_MATH and fpu_fpregs_owner_ctx is not clear. Increase clarity by unambigously signalling that it's about hardware registers being active right now, by renaming it to fpu->fpregs_active. ( In later patches we'll use more of the 'fpregs' naming, which will make it easier to grep for as well. ) [*] There's the kernel_fpu_begin()/end() primitive that also activates FPU hw registers as well and uses them, without touching the fpu->fpregs_active flag. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/api.h | 2 +- arch/x86/include/asm/fpu/internal.h | 12 ++++++------ arch/x86/include/asm/fpu/types.h | 2 +- arch/x86/kernel/fpu/core.c | 10 +++++----- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 5bdde8ca87bc..4ca745c0d92e 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -96,7 +96,7 @@ static inline void irq_ts_restore(int TS_state) */ static inline int user_has_fpu(void) { - return current->thread.fpu.has_fpu; + return current->thread.fpu.fpregs_active; } extern void fpu__save(struct fpu *fpu); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 6eea81c068fb..b546ec816fd6 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -308,7 +308,7 @@ static inline int restore_fpu_checking(struct fpu *fpu) "fnclex\n\t" "emms\n\t" "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (fpu->has_fpu)); + : : [addr] "m" (fpu->fpregs_active)); } return fpu_restore_checking(fpu); @@ -317,14 +317,14 @@ static inline int restore_fpu_checking(struct fpu *fpu) /* Must be paired with an 'stts' after! */ static inline void __thread_clear_has_fpu(struct fpu *fpu) { - fpu->has_fpu = 0; + fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); } /* Must be paired with a 'clts' before! */ static inline void __thread_set_has_fpu(struct fpu *fpu) { - fpu->has_fpu = 1; + fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); } @@ -357,7 +357,7 @@ static inline void drop_fpu(struct fpu *fpu) preempt_disable(); fpu->counter = 0; - if (fpu->has_fpu) { + if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" @@ -416,14 +416,14 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) fpu.preload = new_fpu->fpstate_active && (use_eager_fpu() || new_fpu->counter > 5); - if (old_fpu->has_fpu) { + if (old_fpu->fpregs_active) { if (!fpu_save_init(old_fpu)) old_fpu->last_cpu = -1; else old_fpu->last_cpu = cpu; /* But leave fpu_fpregs_owner_ctx! */ - old_fpu->has_fpu = 0; + old_fpu->fpregs_active = 0; /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 8a5120a3b48b..231a8f53b2f8 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -142,7 +142,7 @@ struct fpu { */ unsigned int last_cpu; - unsigned int has_fpu; + unsigned int fpregs_active; union thread_xstate *state; /* * This counter contains the number of consecutive context switches diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 4978a77269d6..c8ae838dbf11 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -62,7 +62,7 @@ static bool interrupted_kernel_fpu_idle(void) if (use_eager_fpu()) return true; - return !current->thread.fpu.has_fpu && (read_cr0() & X86_CR0_TS); + return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS); } /* @@ -100,7 +100,7 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->has_fpu) { + if (fpu->fpregs_active) { fpu_save_init(fpu); } else { this_cpu_write(fpu_fpregs_owner_ctx, NULL); @@ -114,7 +114,7 @@ void __kernel_fpu_end(void) { struct fpu *fpu = ¤t->thread.fpu; - if (fpu->has_fpu) { + if (fpu->fpregs_active) { if (WARN_ON(restore_fpu_checking(fpu))) fpu_reset_state(fpu); } else if (!use_eager_fpu()) { @@ -147,7 +147,7 @@ void fpu__save(struct fpu *fpu) WARN_ON(fpu != ¤t->thread.fpu); preempt_disable(); - if (fpu->has_fpu) { + if (fpu->fpregs_active) { if (use_eager_fpu()) { __save_fpu(fpu); } else { @@ -243,7 +243,7 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { dst_fpu->counter = 0; - dst_fpu->has_fpu = 0; + dst_fpu->fpregs_active = 0; dst_fpu->state = NULL; dst_fpu->last_cpu = -1; -- cgit v1.2.1 From dfaea4e6a27afffb0cc6da7aaaae81abc127fdb3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 14:26:47 +0200 Subject: x86/fpu: Rename __thread_set_has_fpu() to __fpregs_activate() Propagate the 'fpu->fpregs_active' naming to the functions that sets it. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index b546ec816fd6..3554a8cdaece 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -322,7 +322,7 @@ static inline void __thread_clear_has_fpu(struct fpu *fpu) } /* Must be paired with a 'clts' before! */ -static inline void __thread_set_has_fpu(struct fpu *fpu) +static inline void __fpregs_activate(struct fpu *fpu) { fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); @@ -346,7 +346,7 @@ static inline void __thread_fpu_begin(struct fpu *fpu) { if (!use_eager_fpu()) clts(); - __thread_set_has_fpu(fpu); + __fpregs_activate(fpu); } static inline void drop_fpu(struct fpu *fpu) @@ -428,7 +428,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { new_fpu->counter++; - __thread_set_has_fpu(new_fpu); + __fpregs_activate(new_fpu); prefetch(new_fpu->state); } else if (!use_eager_fpu()) stts(); -- cgit v1.2.1 From 723c58e428fbcbc9f16864edf740ec3bfaf3703c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 14:28:01 +0200 Subject: x86/fpu: Rename __thread_clear_has_fpu() to __fpregs_deactivate() Propagate the 'fpu->fpregs_active' naming to the functions that clears it. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3554a8cdaece..7a235171be6c 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -315,7 +315,7 @@ static inline int restore_fpu_checking(struct fpu *fpu) } /* Must be paired with an 'stts' after! */ -static inline void __thread_clear_has_fpu(struct fpu *fpu) +static inline void __fpregs_deactivate(struct fpu *fpu) { fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); @@ -337,7 +337,7 @@ static inline void __fpregs_activate(struct fpu *fpu) */ static inline void __thread_fpu_end(struct fpu *fpu) { - __thread_clear_has_fpu(fpu); + __fpregs_deactivate(fpu); if (!use_eager_fpu()) stts(); } -- cgit v1.2.1 From 232f62cdd7c7162168a445cbc718a82e7f6e36c1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 14:30:38 +0200 Subject: x86/fpu: Rename __thread_fpu_begin() to fpregs_activate() Propagate the 'fpu->fpregs_active' naming to the high level function that sets it. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 6 +++--- arch/x86/kernel/fpu/core.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7a235171be6c..18a62239c73d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -342,7 +342,7 @@ static inline void __thread_fpu_end(struct fpu *fpu) stts(); } -static inline void __thread_fpu_begin(struct fpu *fpu) +static inline void fpregs_activate(struct fpu *fpu) { if (!use_eager_fpu()) clts(); @@ -441,7 +441,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) fpu.preload = 0; else prefetch(new_fpu->state); - __thread_fpu_begin(new_fpu); + fpregs_activate(new_fpu); } } return fpu; @@ -499,7 +499,7 @@ static inline void user_fpu_begin(void) preempt_disable(); if (!user_has_fpu()) - __thread_fpu_begin(fpu); + fpregs_activate(fpu); preempt_enable(); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c8ae838dbf11..a1768ec8e643 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -368,9 +368,9 @@ void fpu__restore(void) local_irq_disable(); } - /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */ + /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); - __thread_fpu_begin(fpu); + fpregs_activate(fpu); if (unlikely(restore_fpu_checking(fpu))) { fpu_reset_state(fpu); force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); -- cgit v1.2.1 From 66af8e276409196abd59e33919f928e4d002d4f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 14:31:27 +0200 Subject: x86/fpu: Rename __thread_fpu_end() to fpregs_deactivate() Propagate the 'fpu->fpregs_active' naming to the high level function that clears it. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 14 +++++++------- arch/x86/kernel/fpu/core.c | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 18a62239c73d..0292fcc4d441 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -335,18 +335,18 @@ static inline void __fpregs_activate(struct fpu *fpu) * These generally need preemption protection to work, * do try to avoid using these on their own. */ -static inline void __thread_fpu_end(struct fpu *fpu) +static inline void fpregs_activate(struct fpu *fpu) { - __fpregs_deactivate(fpu); if (!use_eager_fpu()) - stts(); + clts(); + __fpregs_activate(fpu); } -static inline void fpregs_activate(struct fpu *fpu) +static inline void fpregs_deactivate(struct fpu *fpu) { + __fpregs_deactivate(fpu); if (!use_eager_fpu()) - clts(); - __fpregs_activate(fpu); + stts(); } static inline void drop_fpu(struct fpu *fpu) @@ -362,7 +362,7 @@ static inline void drop_fpu(struct fpu *fpu) asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(fpu); + fpregs_deactivate(fpu); } fpu->fpstate_active = 0; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a1768ec8e643..3d2ec4bd7f8c 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -152,7 +152,7 @@ void fpu__save(struct fpu *fpu) __save_fpu(fpu); } else { fpu_save_init(fpu); - __thread_fpu_end(fpu); + fpregs_deactivate(fpu); } } preempt_enable(); -- cgit v1.2.1 From 6a133207587bce64efdd0fda9bea09ed994fa690 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 04:29:26 +0200 Subject: x86/fpu: Remove fpstate_xstate_init_size() boot quirk fpstate_xstate_init_size() is called in fpu__cpu_init(), which is run on every CPU, every time they are brought online. But we want to call fpstate_xstate_init_size() only once. Move it to fpu__detect(), which only runs once, on the boot CPU. Also clean up the flow of fpstate_xstate_init_size() a bit, by removing a 'return' from the middle of the function. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 0b16f61cb2a4..3e0fee5bc2e7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -109,13 +109,12 @@ static void fpstate_xstate_init_size(void) setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); xstate_size = sizeof(struct i387_soft_struct); - return; + } else { + if (cpu_has_fxsr) + xstate_size = sizeof(struct i387_fxsave_struct); + else + xstate_size = sizeof(struct i387_fsave_struct); } - - if (cpu_has_fxsr) - xstate_size = sizeof(struct i387_fxsave_struct); - else - xstate_size = sizeof(struct i387_fsave_struct); } /* @@ -151,12 +150,6 @@ void fpu__cpu_init(void) cr0 |= X86_CR0_EM; write_cr0(cr0); - /* - * fpstate_xstate_init_size() is only called once, to avoid overriding - * 'xstate_size' during (secondary CPU) bootup or during CPU hotplug. - */ - if (xstate_size == 0) - fpstate_xstate_init_size(); mxcsr_feature_mask_init(); xsave_init(); @@ -194,5 +187,7 @@ void fpu__detect(struct cpuinfo_x86 *c) else clear_cpu_cap(c, X86_FEATURE_FPU); - /* The final cr0 value is set in fpu_init() */ + /* The final cr0 value is set later, in fpu_init() */ + + fpstate_xstate_init_size(); } -- cgit v1.2.1 From 966ece619eaeae9b5cb6cede7fe6303b7031a51f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 05:04:41 +0200 Subject: x86/fpu: Remove xsave_init() bootmem allocations There's only 8 xstate bits at the moment, and it's not like we can support unknown bits - so put xstate_offsets[] and xstate_sizes[] into static allocation. This is in preparation to be able to call the FPU init code earlier, when there's no bootmem available yet. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xsave.h | 3 +++ arch/x86/kernel/fpu/xsave.c | 4 +--- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xsave.h b/arch/x86/include/asm/fpu/xsave.h index b27b4466f88d..fd564344783e 100644 --- a/arch/x86/include/asm/fpu/xsave.h +++ b/arch/x86/include/asm/fpu/xsave.h @@ -15,6 +15,9 @@ #define XSTATE_ZMM_Hi256 0x40 #define XSTATE_Hi16_ZMM 0x80 +/* The highest xstate bit above (of XSTATE_Hi16_ZMM): */ +#define XFEATURES_NR_MAX 8 + #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) #define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) /* Bit 63 of XCR0 is reserved for future expansion */ diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index f3d30f0c50f9..adeab16655ae 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -23,7 +23,7 @@ u64 xfeatures_mask; struct xsave_struct *init_xstate_buf; static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; -static unsigned int *xstate_offsets, *xstate_sizes; +static unsigned int xstate_offsets[XFEATURES_NR_MAX], xstate_sizes[XFEATURES_NR_MAX]; static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; /* The number of supported xfeatures in xfeatures_mask: */ @@ -478,8 +478,6 @@ static void __init setup_xstate_features(void) int eax, ebx, ecx, edx, leaf = 0x2; xfeatures_nr = fls64(xfeatures_mask); - xstate_offsets = alloc_bootmem(xfeatures_nr * sizeof(int)); - xstate_sizes = alloc_bootmem(xfeatures_nr * sizeof(int)); do { cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); -- cgit v1.2.1 From 26b1f5d05a81a0e60eed718d2d073f050b0afc8f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 05:27:26 +0200 Subject: x86/fpu: Make setup_init_fpu_buf() run-once explicitly Remove the dependency on the init_xstate_buf == NULL check to implement once-per-bootup logic in eager_fpu_init(), by making setup_init_fpu_buf() run once per bootup explicitly. This is in preparation to make init_xstate_buf statically allocated. The various boot-once quirks in the FPU init code will be removed in a later cleanup stage. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index adeab16655ae..d11b33514130 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -565,8 +565,14 @@ void setup_xstate_comp(void) /* * setup the xstate image representing the init state */ -static void __init setup_init_fpu_buf(void) +static void setup_init_fpu_buf(void) { + static int on_boot_cpu = 1; + + if (!on_boot_cpu) + return; + on_boot_cpu = 0; + /* * Setup init_xstate_buf to represent the init state of * all the features managed by the xsave @@ -738,8 +744,7 @@ void __init_refok eager_fpu_init(void) return; } - if (!init_xstate_buf) - setup_init_fpu_buf(); + setup_init_fpu_buf(); } /* -- cgit v1.2.1 From 3e5e1267740f47b1616aff5187b668cadd950047 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 05:08:17 +0200 Subject: x86/fpu: Remove 'init_xstate_buf' bootmem allocation Make init_xstate_buf allocated statically at build time. This structure's maximum size is around 1KB - and it's allocated even on most modern embedded x86 CPUs which strive for FPU instruction set parity with desktop and server CPUs, so it's not like we can save much on smaller systems. This removes the last bootmem allocation from the FPU init path, allowing it to be called earlier in the boot sequence. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/include/asm/fpu/xsave.h | 2 +- arch/x86/kernel/fpu/xsave.c | 26 +++++++++++--------------- 3 files changed, 14 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0292fcc4d441..19b7cdf73efd 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -373,9 +373,9 @@ static inline void drop_fpu(struct fpu *fpu) static inline void restore_init_xstate(void) { if (use_xsave()) - xrstor_state(init_xstate_buf, -1); + xrstor_state(&init_xstate_ctx, -1); else - fxrstor_checking(&init_xstate_buf->i387); + fxrstor_checking(&init_xstate_ctx.i387); } /* diff --git a/arch/x86/include/asm/fpu/xsave.h b/arch/x86/include/asm/fpu/xsave.h index fd564344783e..5c3ab4e17aea 100644 --- a/arch/x86/include/asm/fpu/xsave.h +++ b/arch/x86/include/asm/fpu/xsave.h @@ -50,7 +50,7 @@ extern unsigned int xstate_size; extern u64 xfeatures_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern struct xsave_struct *init_xstate_buf; +extern struct xsave_struct init_xstate_ctx; extern void xsave_init(void); extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index d11b33514130..45130ba6f328 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -3,7 +3,6 @@ * * Author: Suresh Siddha */ -#include #include #include #include @@ -20,7 +19,7 @@ u64 xfeatures_mask; /* * Represents init state for the supported extended state. */ -struct xsave_struct *init_xstate_buf; +struct xsave_struct init_xstate_ctx; static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; static unsigned int xstate_offsets[XFEATURES_NR_MAX], xstate_sizes[XFEATURES_NR_MAX]; @@ -98,7 +97,7 @@ void __sanitize_i387_state(struct task_struct *tsk) int size = xstate_sizes[feature_bit]; memcpy((void *)fx + offset, - (void *)init_xstate_buf + offset, + (void *)&init_xstate_ctx + offset, size); } @@ -325,12 +324,12 @@ static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) if (use_xsave()) { if ((unsigned long)buf % 64 || fx_only) { u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE; - xrstor_state(init_xstate_buf, init_bv); + xrstor_state(&init_xstate_ctx, init_bv); return fxrstor_user(buf); } else { u64 init_bv = xfeatures_mask & ~xbv; if (unlikely(init_bv)) - xrstor_state(init_xstate_buf, init_bv); + xrstor_state(&init_xstate_ctx, init_bv); return xrestore_user(buf, xbv); } } else if (use_fxsr()) { @@ -574,12 +573,10 @@ static void setup_init_fpu_buf(void) on_boot_cpu = 0; /* - * Setup init_xstate_buf to represent the init state of + * Setup init_xstate_ctx to represent the init state of * all the features managed by the xsave */ - init_xstate_buf = alloc_bootmem_align(xstate_size, - __alignof__(struct xsave_struct)); - fx_finit(&init_xstate_buf->i387); + fx_finit(&init_xstate_ctx.i387); if (!cpu_has_xsave) return; @@ -588,21 +585,20 @@ static void setup_init_fpu_buf(void) print_xstate_features(); if (cpu_has_xsaves) { - init_xstate_buf->header.xcomp_bv = - (u64)1 << 63 | xfeatures_mask; - init_xstate_buf->header.xfeatures = xfeatures_mask; + init_xstate_ctx.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; + init_xstate_ctx.header.xfeatures = xfeatures_mask; } /* * Init all the features state with header_bv being 0x0 */ - xrstor_state_booting(init_xstate_buf, -1); + xrstor_state_booting(&init_xstate_ctx, -1); /* * Dump the init state again. This is to identify the init state * of any feature which is not represented by all zero's. */ - xsave_state_booting(init_xstate_buf); + xsave_state_booting(&init_xstate_ctx); } static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; @@ -727,7 +723,7 @@ void xsave_init(void) /* * setup_init_fpu_buf() is __init and it is OK to call it here because - * init_xstate_buf will be unset only once during boot. + * init_xstate_ctx will be unset only once during boot. */ void __init_refok eager_fpu_init(void) { -- cgit v1.2.1 From e35f6f14148c09ad534d122ed32722dd431ac184 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 04:34:48 +0200 Subject: x86/fpu: Split fpu__cpu_init() into early-boot and cpu-boot parts There are two kinds of FPU initialization sequences necessary to bring FPU functionality up: once per system bootup activities, such as detection, feature initialization, etc. of attributes that are shared by all CPUs in the system - and per cpu initialization sequences run when a CPU is brought online (either during bootup or during CPU hotplug onlining), such as CR0/CR4 register setting, etc. The FPU code is mixing these roles together, with no clear distinction. Start sorting this out by splitting the main FPU detection routine (fpu__cpu_init()) into two parts: fpu__init_system() for one per system init activities, and fpu__init_cpu() for the per CPU onlining init activities. Note that xstate_init() is called from both variants for the time being, because it has a dual nature as well. We'll fix that in upcoming patches. Just do the split and call it as we used to before, don't introduce any change in initialization behavior yet, beyond duplicate (and harmless) fpu__init_cpu() and xstate_init() calls - which we'll fix in later patches. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 3e0fee5bc2e7..d6234adc8ba0 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -118,13 +118,9 @@ static void fpstate_xstate_init_size(void) } /* - * Called on the boot CPU at bootup to set up the initial FPU state that - * is later cloned into all processes. - * - * Also called on secondary CPUs to set up the FPU state of their - * idle threads. + * Enable all supported FPU features. Called when a CPU is brought online. */ -void fpu__cpu_init(void) +void fpu__init_cpu(void) { unsigned long cr0; unsigned long cr4_mask = 0; @@ -150,12 +146,29 @@ void fpu__cpu_init(void) cr0 |= X86_CR0_EM; write_cr0(cr0); + xsave_init(); +} + +/* + * Called on the boot CPU once per system bootup, to set up the initial FPU state that + * is later cloned into all processes. + */ +void fpu__init_system(void) +{ + /* The FPU has to be operational for some of the later FPU init activities: */ + fpu__init_cpu(); mxcsr_feature_mask_init(); xsave_init(); eager_fpu_init(); } +void fpu__cpu_init(void) +{ + fpu__init_cpu(); + fpu__init_system(); +} + static int __init no_387(char *s) { setup_clear_cpu_cap(X86_FEATURE_FPU); -- cgit v1.2.1 From 55cc4678b7ee2edd3e6a9411250530eb871bc61d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 06:26:36 +0200 Subject: x86/fpu: Make the system/cpu init distinction clear in the xstate code as well Rename existing xstate init functions along the system/cpu init principles: fpu__init_system_xstate(): called once per system bootup fpu__init_cpu_xstate(): called per CPU onlining Also make the fpu__init_cpu_xstate() early code invariant: if xfeatures_mask is not set yet then don't crash but return. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 3 +++ arch/x86/kernel/fpu/xsave.c | 22 ++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 19b7cdf73efd..71d44be5acb1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -39,6 +39,9 @@ extern unsigned int mxcsr_feature_mask; extern void fpu__cpu_init(void); extern void eager_fpu_init(void); +extern void fpu__init_system_xstate(void); +extern void fpu__init_cpu_xstate(void); + DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); extern void convert_from_fxsr(struct user_i387_ia32_struct *env, diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 45130ba6f328..961c25850c7f 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -460,10 +460,14 @@ static void prepare_fx_sw_frame(void) } /* - * Enable the extended processor state save/restore feature + * Enable the extended processor state save/restore feature. + * Called once per CPU onlining. */ -static inline void xstate_enable(void) +void fpu__init_cpu_xstate(void) { + if (!xfeatures_mask) + return; + cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); } @@ -640,11 +644,12 @@ static void __init init_xstate_size(void) /* * Enable and initialize the xsave feature. + * Called once per system bootup. * * ( Not marked __init because of false positive section warnings * generated by xsave_init(). ) */ -static void /* __init */ xstate_enable_boot_cpu(void) +void fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; @@ -666,7 +671,8 @@ static void /* __init */ xstate_enable_boot_cpu(void) */ xfeatures_mask = xfeatures_mask & XCNTXT_MASK; - xstate_enable(); + /* Enable xstate instructions to be able to continue with initialization: */ + fpu__init_cpu_xstate(); /* * Recompute the context size for enabled features @@ -698,8 +704,8 @@ static void /* __init */ xstate_enable_boot_cpu(void) } /* - * For the very first instance, this calls xstate_enable_boot_cpu(); - * for all subsequent instances, this calls xstate_enable(). + * For the very first instance, this calls fpu__init_system_xstate(); + * for all subsequent instances, this calls fpu__init_cpu_xstate(). */ void xsave_init(void) { @@ -715,9 +721,9 @@ void xsave_init(void) if (on_boot_cpu) { on_boot_cpu = 0; - xstate_enable_boot_cpu(); + fpu__init_system_xstate(); } else { - xstate_enable(); + fpu__init_cpu_xstate(); } } -- cgit v1.2.1 From e84611fc96c67d3a073e327be44d0f9ee3e981ef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 06:41:07 +0200 Subject: x86/fpu: Move CPU capability check into fpu__init_cpu_xstate() fpu__init_system_xstate() does an FPU capability check that is better done in fpu__init_cpu_xstate(). This will allow us to call fpu__init_cpu_xstate() directly on legacy CPUs as well. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 961c25850c7f..0610f431f77f 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -465,7 +465,7 @@ static void prepare_fx_sw_frame(void) */ void fpu__init_cpu_xstate(void) { - if (!xfeatures_mask) + if (!cpu_has_xsave || !xfeatures_mask) return; cr4_set_bits(X86_CR4_OSXSAVE); -- cgit v1.2.1 From e9dbfd673a9cc9d3ef90e3ea4e136a833dab3674 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 06:47:24 +0200 Subject: x86/fpu: Move legacy check to fpu__init_system_xstate() Now that legacy code can execute fpu__init_cpu_xstate() in xsave_init(), we can move the once per boot legacy check into fpu__init_system_xstate(), where it belongs. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 0610f431f77f..fd656cbdd315 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -653,6 +653,11 @@ void fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; + if (!cpu_has_xsave) { + pr_info("x86/fpu: Legacy x87 FPU detected.\n"); + return; + } + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { WARN(1, "x86/fpu: XSTATE_CPUID missing!\n"); return; @@ -711,14 +716,6 @@ void xsave_init(void) { static char on_boot_cpu = 1; - if (!cpu_has_xsave) { - if (on_boot_cpu) { - on_boot_cpu = 0; - pr_info("x86/fpu: Legacy x87 FPU detected.\n"); - } - return; - } - if (on_boot_cpu) { on_boot_cpu = 0; fpu__init_system_xstate(); -- cgit v1.2.1 From 62db6871ae862fbd7f7abfebffe501b170698a8b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 06:50:09 +0200 Subject: x86/fpu: Propagate once per boot quirk into fpu__init_system_xstate() Linearize the call sequence in xsave_init(): fpu__init_system_xstate(); fpu__init_cpu_xstate(); We do this by propagating the boot-once quirk into fpu__init_system_xstate(). fpu__init_cpu_xstate() is safe to be called multiple time. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index fd656cbdd315..9d5ff90916b1 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -652,6 +652,11 @@ static void __init init_xstate_size(void) void fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; + static bool on_boot_cpu = 1; + + if (!on_boot_cpu) + return; + on_boot_cpu = 0; if (!cpu_has_xsave) { pr_info("x86/fpu: Legacy x87 FPU detected.\n"); @@ -714,14 +719,8 @@ void fpu__init_system_xstate(void) */ void xsave_init(void) { - static char on_boot_cpu = 1; - - if (on_boot_cpu) { - on_boot_cpu = 0; - fpu__init_system_xstate(); - } else { - fpu__init_cpu_xstate(); - } + fpu__init_system_xstate(); + fpu__init_cpu_xstate(); } /* -- cgit v1.2.1 From c42103b22652dae50e1aaacb5c2767145027ab3e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 06:52:53 +0200 Subject: x86/fpu: Remove xsave_init() Expand fpu__init_system_xstate() and fpu__init_cpu_xstate() calls into xsave_init() calls. (This will allow us to call the proper versions in higher level FPU init code later on.) No change in functionality. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xsave.h | 1 - arch/x86/kernel/fpu/init.c | 8 +++++--- arch/x86/kernel/fpu/xsave.c | 13 +------------ 3 files changed, 6 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xsave.h b/arch/x86/include/asm/fpu/xsave.h index 5c3ab4e17aea..a10e66582c1b 100644 --- a/arch/x86/include/asm/fpu/xsave.h +++ b/arch/x86/include/asm/fpu/xsave.h @@ -52,7 +52,6 @@ extern u64 xfeatures_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern struct xsave_struct init_xstate_ctx; -extern void xsave_init(void); extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); /* These macros all use (%edi)/(%rdi) as the single memory argument. */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d6234adc8ba0..77599fe8af56 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -98,7 +98,7 @@ static void fpstate_xstate_init_size(void) { /* * Note that xstate_size might be overwriten later during - * xsave_init(). + * fpu__init_system_xstate(). */ if (!cpu_has_fpu) { @@ -146,7 +146,8 @@ void fpu__init_cpu(void) cr0 |= X86_CR0_EM; write_cr0(cr0); - xsave_init(); + fpu__init_system_xstate(); + fpu__init_cpu_xstate(); } /* @@ -159,7 +160,8 @@ void fpu__init_system(void) fpu__init_cpu(); mxcsr_feature_mask_init(); - xsave_init(); + fpu__init_system_xstate(); + fpu__init_cpu_xstate(); eager_fpu_init(); } diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 9d5ff90916b1..fa9b954eb23a 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -646,8 +646,7 @@ static void __init init_xstate_size(void) * Enable and initialize the xsave feature. * Called once per system bootup. * - * ( Not marked __init because of false positive section warnings - * generated by xsave_init(). ) + * ( Not marked __init because of false positive section warnings. ) */ void fpu__init_system_xstate(void) { @@ -713,16 +712,6 @@ void fpu__init_system_xstate(void) cpu_has_xsaves ? "compacted" : "standard"); } -/* - * For the very first instance, this calls fpu__init_system_xstate(); - * for all subsequent instances, this calls fpu__init_cpu_xstate(). - */ -void xsave_init(void) -{ - fpu__init_system_xstate(); - fpu__init_cpu_xstate(); -} - /* * setup_init_fpu_buf() is __init and it is OK to call it here because * init_xstate_ctx will be unset only once during boot. -- cgit v1.2.1 From 429ced50a0e5f863f95b100749043451e1968c4c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 08:21:08 +0200 Subject: x86/fpu: Do fpu__init_system_xstate only from fpu__init_system() Only call xstate system setup routines from fpu__init_system(). Likewise, don't call fpu__init_cpu_xstate() from fpu__init_system(). Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 77599fe8af56..c1b2d1cfe745 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -146,7 +146,6 @@ void fpu__init_cpu(void) cr0 |= X86_CR0_EM; write_cr0(cr0); - fpu__init_system_xstate(); fpu__init_cpu_xstate(); } @@ -161,7 +160,6 @@ void fpu__init_system(void) mxcsr_feature_mask_init(); fpu__init_system_xstate(); - fpu__init_cpu_xstate(); eager_fpu_init(); } -- cgit v1.2.1 From 2507e1c03f577173d613d6728329eb220724c577 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 08:35:53 +0200 Subject: x86/fpu: Set up the legacy FPU init image from fpu__init_system() The legacy FPU init image is used on older CPUs who don't run xstate init. But the init code is called within setup_init_fpu_buf(), an xstate method. Move this legacy init out of the xstate code and put it into fpu/init.c. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 6 ++++++ arch/x86/kernel/fpu/xsave.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index c1b2d1cfe745..30d2d5d03cb0 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -158,6 +158,12 @@ void fpu__init_system(void) /* The FPU has to be operational for some of the later FPU init activities: */ fpu__init_cpu(); + /* + * Set up the legacy init FPU context. (xstate init might overwrite this + * with a more modern format, if the CPU supports it.) + */ + fx_finit(&init_xstate_ctx.i387); + mxcsr_feature_mask_init(); fpu__init_system_xstate(); eager_fpu_init(); diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index fa9b954eb23a..6be0a98238f6 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -576,12 +576,6 @@ static void setup_init_fpu_buf(void) return; on_boot_cpu = 0; - /* - * Setup init_xstate_ctx to represent the init state of - * all the features managed by the xsave - */ - fx_finit(&init_xstate_ctx.i387); - if (!cpu_has_xsave) return; -- cgit v1.2.1 From a5cb56e9a68aa0e19281ad93f1fce6e6802e3206 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 08:42:47 +0200 Subject: x86/fpu: Remove setup_init_fpu_buf() call from eager_fpu_init() It's a pure xstate method now, no need for this duplicate call. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 6be0a98238f6..097f03e209a6 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -724,8 +724,6 @@ void __init_refok eager_fpu_init(void) stts(); return; } - - setup_init_fpu_buf(); } /* -- cgit v1.2.1 From 89abbe01a46a9f47f8d07e2817704bfcbd8f6f9e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 09:02:51 +0200 Subject: x86/fpu: Move all eager-fpu setup code to eager_fpu_init() The FPU context switch type (lazy or eager) setup code is split into two places currently - move it all to eager_fpu_init(). Note that the code we move will now be executed on non-xstate CPUs as well, but this should be safe: both xfeatures_mask and cpu_has_xsaveopt is 0 there. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xsave.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 097f03e209a6..1b920a170576 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -686,20 +686,6 @@ void fpu__init_system_xstate(void) prepare_fx_sw_frame(); setup_init_fpu_buf(); - /* Auto enable eagerfpu for xsaveopt */ - if (cpu_has_xsaveopt && eagerfpu != DISABLE) - eagerfpu = ENABLE; - - if (xfeatures_mask & XSTATE_EAGER) { - if (eagerfpu == DISABLE) { - pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", - xfeatures_mask & XSTATE_EAGER); - xfeatures_mask &= ~XSTATE_EAGER; - } else { - eagerfpu = ENABLE; - } - } - pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n", xfeatures_mask, xstate_size, @@ -715,6 +701,20 @@ void __init_refok eager_fpu_init(void) WARN_ON(current->thread.fpu.fpstate_active); current_thread_info()->status = 0; + /* Auto enable eagerfpu for xsaveopt */ + if (cpu_has_xsaveopt && eagerfpu != DISABLE) + eagerfpu = ENABLE; + + if (xfeatures_mask & XSTATE_EAGER) { + if (eagerfpu == DISABLE) { + pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", + xfeatures_mask & XSTATE_EAGER); + xfeatures_mask &= ~XSTATE_EAGER; + } else { + eagerfpu = ENABLE; + } + } + if (eagerfpu == ENABLE) setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); -- cgit v1.2.1 From 6f5d265afffb5968e25a8951a085c0467558c073 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 20:11:05 +0200 Subject: x86/fpu: Move eager_fpu_init() to fpu/init.c Move eager_fpu_init() and the 'eagerfpu' boot parameter handling function to the generic FPU init file: it's generic FPU functionality. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 48 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/xsave.c | 48 --------------------------------------------- 2 files changed, 48 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 30d2d5d03cb0..fa9678f13630 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -149,6 +149,54 @@ void fpu__init_cpu(void) fpu__init_cpu_xstate(); } +static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; + +static int __init eager_fpu_setup(char *s) +{ + if (!strcmp(s, "on")) + eagerfpu = ENABLE; + else if (!strcmp(s, "off")) + eagerfpu = DISABLE; + else if (!strcmp(s, "auto")) + eagerfpu = AUTO; + return 1; +} +__setup("eagerfpu=", eager_fpu_setup); + +/* + * setup_init_fpu_buf() is __init and it is OK to call it here because + * init_xstate_ctx will be unset only once during boot. + */ +void __init_refok eager_fpu_init(void) +{ + WARN_ON(current->thread.fpu.fpstate_active); + current_thread_info()->status = 0; + + /* Auto enable eagerfpu for xsaveopt */ + if (cpu_has_xsaveopt && eagerfpu != DISABLE) + eagerfpu = ENABLE; + + if (xfeatures_mask & XSTATE_EAGER) { + if (eagerfpu == DISABLE) { + pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", + xfeatures_mask & XSTATE_EAGER); + xfeatures_mask &= ~XSTATE_EAGER; + } else { + eagerfpu = ENABLE; + } + } + + if (eagerfpu == ENABLE) + setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + + printk_once(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); + + if (!cpu_has_eager_fpu) { + stts(); + return; + } +} + /* * Called on the boot CPU once per system bootup, to set up the initial FPU state that * is later cloned into all processes. diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 1b920a170576..a23236358fb0 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -599,20 +599,6 @@ static void setup_init_fpu_buf(void) xsave_state_booting(&init_xstate_ctx); } -static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; -static int __init eager_fpu_setup(char *s) -{ - if (!strcmp(s, "on")) - eagerfpu = ENABLE; - else if (!strcmp(s, "off")) - eagerfpu = DISABLE; - else if (!strcmp(s, "auto")) - eagerfpu = AUTO; - return 1; -} -__setup("eagerfpu=", eager_fpu_setup); - - /* * Calculate total size of enabled xstates in XCR0/xfeatures_mask. */ @@ -692,40 +678,6 @@ void fpu__init_system_xstate(void) cpu_has_xsaves ? "compacted" : "standard"); } -/* - * setup_init_fpu_buf() is __init and it is OK to call it here because - * init_xstate_ctx will be unset only once during boot. - */ -void __init_refok eager_fpu_init(void) -{ - WARN_ON(current->thread.fpu.fpstate_active); - current_thread_info()->status = 0; - - /* Auto enable eagerfpu for xsaveopt */ - if (cpu_has_xsaveopt && eagerfpu != DISABLE) - eagerfpu = ENABLE; - - if (xfeatures_mask & XSTATE_EAGER) { - if (eagerfpu == DISABLE) { - pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", - xfeatures_mask & XSTATE_EAGER); - xfeatures_mask &= ~XSTATE_EAGER; - } else { - eagerfpu = ENABLE; - } - } - - if (eagerfpu == ENABLE) - setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); - - printk_once(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); - - if (!cpu_has_eager_fpu) { - stts(); - return; - } -} - /* * Restore minimal FPU state after suspend: */ -- cgit v1.2.1 From 064e51e3c8aee6cfb03ab75e9f9551db3924eb07 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 08:48:54 +0200 Subject: x86/fpu: Clean up eager_fpu_init() and rename it to fpu__ctx_switch_init() It's not an xsave specific function anymore, so rename it accordingly and also clean it up a bit: - remove the obsolete __init_refok, as the code paths are not mixed anymore - rename it from eager_fpu_init() to fpu__ctx_switch_init() - remove stray 'return;' - make it static to its only user Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index fa9678f13630..d6d582080c3b 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -167,7 +167,7 @@ __setup("eagerfpu=", eager_fpu_setup); * setup_init_fpu_buf() is __init and it is OK to call it here because * init_xstate_ctx will be unset only once during boot. */ -void __init_refok eager_fpu_init(void) +static void fpu__ctx_switch_init(void) { WARN_ON(current->thread.fpu.fpstate_active); current_thread_info()->status = 0; @@ -191,10 +191,8 @@ void __init_refok eager_fpu_init(void) printk_once(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); - if (!cpu_has_eager_fpu) { + if (!cpu_has_eager_fpu) stts(); - return; - } } /* @@ -214,7 +212,7 @@ void fpu__init_system(void) mxcsr_feature_mask_init(); fpu__init_system_xstate(); - eager_fpu_init(); + fpu__ctx_switch_init(); } void fpu__cpu_init(void) -- cgit v1.2.1 From 011545b570be22191047d07299515c1d711eeb38 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 08:28:31 +0200 Subject: x86/fpu: Split fpu__ctx_switch_init() into _cpu() and _system() portions So fpu__ctx_switch_init() has two aspects: a once per bootup functionality that sets up a capability flag, and a per CPU functionality that sets CR0::TS. Split the function. Note that at this stage we still have duplicate calls into these methods, as both the _system() and the _cpu() methods are run on all CPUs, with lower level on_boot_cpu flags filtering out the duplicates where needed. So add TS flag clearing as well, to handle the aftermath of early CPU init sequences that might call in without having eager-fpu set - don't assume the TS flag is cleared. Calling each from its respective init level will happen later on. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d6d582080c3b..2752b4bae854 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -117,6 +117,18 @@ static void fpstate_xstate_init_size(void) } } +/* + * Initialize the TS bit in CR0 according to the style of context-switches + * we are using: + */ +static void fpu__init_cpu_ctx_switch(void) +{ + if (!cpu_has_eager_fpu) + stts(); + else + clts(); +} + /* * Enable all supported FPU features. Called when a CPU is brought online. */ @@ -167,7 +179,7 @@ __setup("eagerfpu=", eager_fpu_setup); * setup_init_fpu_buf() is __init and it is OK to call it here because * init_xstate_ctx will be unset only once during boot. */ -static void fpu__ctx_switch_init(void) +static void fpu__init_system_ctx_switch(void) { WARN_ON(current->thread.fpu.fpstate_active); current_thread_info()->status = 0; @@ -190,9 +202,6 @@ static void fpu__ctx_switch_init(void) setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); printk_once(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); - - if (!cpu_has_eager_fpu) - stts(); } /* @@ -212,7 +221,8 @@ void fpu__init_system(void) mxcsr_feature_mask_init(); fpu__init_system_xstate(); - fpu__ctx_switch_init(); + fpu__init_system_ctx_switch(); + fpu__init_cpu_ctx_switch(); } void fpu__cpu_init(void) -- cgit v1.2.1 From 530b37e43ce3f0bb9969308c0a64901b442f8e0a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 08:27:44 +0200 Subject: x86/fpu: Do CLTS fpu__init_system() mxcsr_feature_mask_init() depends on TS being cleared, as it executes an FXSAVE instruction. After later changes we will move the TS setup into fpu__init_cpu(), which will interact with this - so clear the TS flag explicitly. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 2752b4bae854..567e7e6cdc6b 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -213,6 +213,13 @@ void fpu__init_system(void) /* The FPU has to be operational for some of the later FPU init activities: */ fpu__init_cpu(); + /* + * But don't leave CR0::TS set yet, as some of the FPU setup methods depend + * on being able to execute FPU instructions that will fault on a set TS, + * such as the FXSAVE in mxcsr_feature_mask_init(). + */ + clts(); + /* * Set up the legacy init FPU context. (xstate init might overwrite this * with a more modern format, if the CPU supports it.) -- cgit v1.2.1 From 997578b14c2730226a804d53ce681d3506f2f876 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 10:35:57 +0200 Subject: x86/fpu: Move the fpstate_xstate_init_size() call into fpu__init_system() The fpstate_xstate_init_size() function sets up a basic xstate_size, called during fpu__detect() currently. Its real dependency is to be called before fpu__init_system_xstate(). So move the function call site into fpu__init_system(), to right before the fpu__init_system_xstate() call. Also add a once-per-boot flag to fpstate_xstate_init_size(), we'll remove this quirk later once we've cleaned up the init dependencies. This moves the two related functions closer to each other and makes them both part of the _init_system() functionality. Currently we do the fpstate_xstate_init_size() Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 567e7e6cdc6b..ca3468d8bc31 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -96,6 +96,12 @@ static void mxcsr_feature_mask_init(void) static void fpstate_xstate_init_size(void) { + static bool on_boot_cpu = 1; + + if (!on_boot_cpu) + return; + on_boot_cpu = 0; + /* * Note that xstate_size might be overwriten later during * fpu__init_system_xstate(). @@ -227,7 +233,10 @@ void fpu__init_system(void) fx_finit(&init_xstate_ctx.i387); mxcsr_feature_mask_init(); + + fpstate_xstate_init_size(); fpu__init_system_xstate(); + fpu__init_system_ctx_switch(); fpu__init_cpu_ctx_switch(); } @@ -270,6 +279,4 @@ void fpu__detect(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_FPU); /* The final cr0 value is set later, in fpu_init() */ - - fpstate_xstate_init_size(); } -- cgit v1.2.1 From 3960fccf2e22c3b3d61bd5a45b6172e66e660d8b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 08:27:44 +0200 Subject: x86/fpu: Call fpu__init_cpu_ctx_switch() from fpu__init_cpu() fpu__init_cpu() is currently called from fpu__init_system(), which is the wrong place for it: call it from the proper high level per CPU init function, fpu__init_cpu(). Note, we still keep the old call site as well, because it depends on having proper CR0::TS setup. We'll fix this in the next patch. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index ca3468d8bc31..b3ea4f86d643 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -165,6 +165,7 @@ void fpu__init_cpu(void) write_cr0(cr0); fpu__init_cpu_xstate(); + fpu__init_cpu_ctx_switch(); } static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; -- cgit v1.2.1 From 067051ccd209623cb56152cf4cb06616ee2bcc5c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Apr 2015 08:27:44 +0200 Subject: x86/fpu: Do system-wide setup from fpu__detect() fpu__cpu_init() is called on every CPU, so it is the wrong place to call fpu__init_system() from. Call it from fpu__detect(): this is early CPU init code, but we already have CPU features detected, so we can call the system-wide FPU init code from here. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index b3ea4f86d643..6e422cf1e197 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -245,7 +245,6 @@ void fpu__init_system(void) void fpu__cpu_init(void) { fpu__init_cpu(); - fpu__init_system(); } static int __init no_387(char *s) @@ -279,5 +278,6 @@ void fpu__detect(struct cpuinfo_x86 *c) else clear_cpu_cap(c, X86_FEATURE_FPU); + fpu__init_system(); /* The final cr0 value is set later, in fpu_init() */ } -- cgit v1.2.1 From 7202ab46f7392265c1ecbf03f600393bf32a8bdf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Apr 2015 12:17:57 +0200 Subject: x86/fpu: Remove fpu__init_cpu_ctx_switch() call from fpu__init_system() We are now doing the fpu__init_cpu_ctx_switch() call from fpu__init_cpu(), so there's no need to call it from fpu__init_system() anymore. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6e422cf1e197..0c9c1069fba8 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -239,7 +239,6 @@ void fpu__init_system(void) fpu__init_system_xstate(); fpu__init_system_ctx_switch(); - fpu__init_cpu_ctx_switch(); } void fpu__cpu_init(void) -- cgit v1.2.1 From 21c4cd108a1b144ad645355bfee1f8be937f03a2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 14:27:17 +0200 Subject: x86/fpu: Simplify fpu__cpu_init() After the latest round of cleanups, fpu__cpu_init() has become a simple call to fpu__init_cpu(). Rename fpu__init_cpu() to fpu__cpu_init() and remove the extra layer. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/fpu/init.c | 5 ----- arch/x86/xen/enlighten.c | 2 +- 4 files changed, 4 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 71d44be5acb1..4617eeb57004 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -36,7 +36,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, #define MXCSR_DEFAULT 0x1f80 extern unsigned int mxcsr_feature_mask; -extern void fpu__cpu_init(void); +extern void fpu__init_cpu(void); extern void eager_fpu_init(void); extern void fpu__init_system_xstate(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8f6a4ea39657..d28f8ebc506d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1435,7 +1435,7 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu__cpu_init(); + fpu__init_cpu(); if (is_uv_system()) uv_cpu_init(); @@ -1491,7 +1491,7 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu__cpu_init(); + fpu__init_cpu(); } #endif diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 0c9c1069fba8..cf27bbed1ba1 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -241,11 +241,6 @@ void fpu__init_system(void) fpu__init_system_ctx_switch(); } -void fpu__cpu_init(void) -{ - fpu__init_cpu(); -} - static int __init no_387(char *s) { setup_clear_cpu_cap(X86_FEATURE_FPU); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 43f8704b7289..98088bf5906a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1423,7 +1423,7 @@ static void xen_pvh_set_cr_flags(int cpu) return; /* * For BSP, PSE PGE are set in probe_page_size_mask(), for APs - * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__cpu_init(). + * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu(). */ if (cpu_has_pse) cr4_set_bits_and_update_boot(X86_CR4_PSE); -- cgit v1.2.1 From b11316ed9ed9e453562d9f89a39d344331a3ec1d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 14:32:34 +0200 Subject: x86/fpu: Factor out fpu__init_cpu_generic() Factor out the generic bits from fpu__init_cpu(), to create a flat sequence of per CPU initialization function calls: fpu__init_cpu_generic(); fpu__init_cpu_xstate(); fpu__init_cpu_ctx_switch(); Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index cf27bbed1ba1..37e8b139dc31 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -136,9 +136,9 @@ static void fpu__init_cpu_ctx_switch(void) } /* - * Enable all supported FPU features. Called when a CPU is brought online. + * Initialize the registers found in all CPUs, CR0 and CR4: */ -void fpu__init_cpu(void) +static void fpu__init_cpu_generic(void) { unsigned long cr0; unsigned long cr4_mask = 0; @@ -163,7 +163,14 @@ void fpu__init_cpu(void) if (!cpu_has_fpu) cr0 |= X86_CR0_EM; write_cr0(cr0); +} +/* + * Enable all supported FPU features. Called when a CPU is brought online. + */ +void fpu__init_cpu(void) +{ + fpu__init_cpu_generic(); fpu__init_cpu_xstate(); fpu__init_cpu_ctx_switch(); } -- cgit v1.2.1 From 7218e8b723dc9ceb71cf2fbd3d019e9e11b7d3cf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 14:35:54 +0200 Subject: x86/fpu: Factor out fpu__init_system_generic() Factor out the generic bits from fpu__init_system(). Rename mxcsr_feature_mask_init() to fpu__init_system_mxcsr() to bring it in line with the rest of the nomenclature. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 37e8b139dc31..c3f3a89cbbf6 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -72,7 +72,7 @@ unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); -static void mxcsr_feature_mask_init(void) +static void fpu__init_system_mxcsr(void) { unsigned int mask = 0; @@ -94,6 +94,20 @@ static void mxcsr_feature_mask_init(void) mxcsr_feature_mask &= mask; } +/* + * Once per bootup FPU initialization sequences that will run on most x86 CPUs: + */ +static void fpu__init_system_generic(void) +{ + /* + * Set up the legacy init FPU context. (xstate init might overwrite this + * with a more modern format, if the CPU supports it.) + */ + fx_finit(&init_xstate_ctx.i387); + + fpu__init_system_mxcsr(); +} + static void fpstate_xstate_init_size(void) { static bool on_boot_cpu = 1; @@ -230,18 +244,11 @@ void fpu__init_system(void) /* * But don't leave CR0::TS set yet, as some of the FPU setup methods depend * on being able to execute FPU instructions that will fault on a set TS, - * such as the FXSAVE in mxcsr_feature_mask_init(). + * such as the FXSAVE in fpu__init_system_mxcsr(). */ clts(); - /* - * Set up the legacy init FPU context. (xstate init might overwrite this - * with a more modern format, if the CPU supports it.) - */ - fx_finit(&init_xstate_ctx.i387); - - mxcsr_feature_mask_init(); - + fpu__init_system_generic(); fpstate_xstate_init_size(); fpu__init_system_xstate(); -- cgit v1.2.1 From 2e2f3da7714f323a0db65baa19b2b8110cc23f95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 14:40:54 +0200 Subject: x86/fpu: Factor out fpu__init_system_early_generic() Move the generic bits of fpu__detect() into fpu__init_system_early_generic(). We'll move some other code here too in a followup patch. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index c3f3a89cbbf6..3637c509956d 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -64,6 +64,29 @@ void fpu__init_check_bugs(void) check_fpu(); } +/* + * The earliest FPU detection code: + */ +static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +{ + unsigned long cr0; + u16 fsw, fcw; + + fsw = fcw = 0xffff; + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS | X86_CR0_EM); + write_cr0(cr0); + + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); + + if (fsw == 0 && (fcw & 0x103f) == 0x003f) + set_cpu_cap(c, X86_FEATURE_FPU); + else + clear_cpu_cap(c, X86_FEATURE_FPU); +} + /* * Boot time FPU feature detection code: */ @@ -269,23 +292,7 @@ __setup("no387", no_387); */ void fpu__detect(struct cpuinfo_x86 *c) { - unsigned long cr0; - u16 fsw, fcw; - - fsw = fcw = 0xffff; - - cr0 = read_cr0(); - cr0 &= ~(X86_CR0_TS | X86_CR0_EM); - write_cr0(cr0); - - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); - - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); - else - clear_cpu_cap(c, X86_FEATURE_FPU); - + fpu__init_system_early_generic(c); fpu__init_system(); /* The final cr0 value is set later, in fpu_init() */ } -- cgit v1.2.1 From e83ab9ad97a702917cb018e6c5a7d1176ff95305 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 14:43:44 +0200 Subject: x86/fpu: Move !FPU check ingo fpu__init_system_early_generic() There's a !FPU related sanity check in fpu__init_cpu_generic(), which is executed on every CPU onlining - even though we should do this only once, and during system init. Move this check to fpu__init_system_early_generic(). Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 3637c509956d..69cdadd49ddf 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -85,6 +85,15 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_FPU); else clear_cpu_cap(c, X86_FEATURE_FPU); + +#ifndef CONFIG_MATH_EMULATION + if (!cpu_has_fpu) { + pr_emerg("No FPU found and no math emulation present\n"); + pr_emerg("Giving up\n"); + for (;;) + asm volatile("hlt"); + } +#endif } /* @@ -180,14 +189,6 @@ static void fpu__init_cpu_generic(void) unsigned long cr0; unsigned long cr4_mask = 0; -#ifndef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) { - pr_emerg("No FPU found and no math emulation present\n"); - pr_emerg("Giving up\n"); - for (;;) - asm volatile("hlt"); - } -#endif if (cpu_has_fxsr) cr4_mask |= X86_CR4_OSFXSR; if (cpu_has_xmm) -- cgit v1.2.1 From 0bf23f3d6cadb2f0961d0ea370371cf35480bcb5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 14:48:08 +0200 Subject: x86/fpu: Factor out FPU bug checks into fpu/bugs.c Create separate fpu/bugs.c code so that if we read generic FPU code we don't have to wade through all the bugcheck related code first. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/Makefile | 2 +- arch/x86/kernel/fpu/bugs.c | 64 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/init.c | 60 ----------------------------------------- 3 files changed, 65 insertions(+), 61 deletions(-) create mode 100644 arch/x86/kernel/fpu/bugs.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile index 50464a716b87..2020a2b7a597 100644 --- a/arch/x86/kernel/fpu/Makefile +++ b/arch/x86/kernel/fpu/Makefile @@ -2,4 +2,4 @@ # Build rules for the FPU support code: # -obj-y += init.o core.o xsave.o +obj-y += init.o bugs.o core.o xsave.o diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c new file mode 100644 index 000000000000..400a3d713fb2 --- /dev/null +++ b/arch/x86/kernel/fpu/bugs.c @@ -0,0 +1,64 @@ +/* + * x86 FPU bug checks: + */ +#include + +/* + * Boot time CPU/FPU FDIV bug detection code: + */ + +static double __initdata x = 4195835.0; +static double __initdata y = 3145727.0; + +/* + * This used to check for exceptions.. + * However, it turns out that to support that, + * the XMM trap handlers basically had to + * be buggy. So let's have a correct XMM trap + * handler, and forget about printing out + * some status at boot. + * + * We should really only care about bugs here + * anyway. Not features. + */ +static void __init check_fpu(void) +{ + s32 fdiv_bug; + + kernel_fpu_begin(); + + /* + * trap_init() enabled FXSR and company _before_ testing for FP + * problems here. + * + * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug + */ + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&fdiv_bug) + : "m" (*&x), "m" (*&y)); + + kernel_fpu_end(); + + if (fdiv_bug) { + set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); + pr_warn("Hmm, FPU with FDIV bug\n"); + } +} + +void fpu__init_check_bugs(void) +{ + /* + * kernel_fpu_begin/end() in check_fpu() relies on the patched + * alternative instructions. + */ + if (cpu_has_fpu) + check_fpu(); +} diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 69cdadd49ddf..63cd1703d25c 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -4,66 +4,6 @@ #include #include -/* - * Boot time CPU/FPU FDIV bug detection code: - */ - -static double __initdata x = 4195835.0; -static double __initdata y = 3145727.0; - -/* - * This used to check for exceptions.. - * However, it turns out that to support that, - * the XMM trap handlers basically had to - * be buggy. So let's have a correct XMM trap - * handler, and forget about printing out - * some status at boot. - * - * We should really only care about bugs here - * anyway. Not features. - */ -static void __init check_fpu(void) -{ - s32 fdiv_bug; - - kernel_fpu_begin(); - - /* - * trap_init() enabled FXSR and company _before_ testing for FP - * problems here. - * - * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug - */ - __asm__("fninit\n\t" - "fldl %1\n\t" - "fdivl %2\n\t" - "fmull %2\n\t" - "fldl %1\n\t" - "fsubp %%st,%%st(1)\n\t" - "fistpl %0\n\t" - "fwait\n\t" - "fninit" - : "=m" (*&fdiv_bug) - : "m" (*&x), "m" (*&y)); - - kernel_fpu_end(); - - if (fdiv_bug) { - set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); - pr_warn("Hmm, FPU with FDIV bug\n"); - } -} - -void fpu__init_check_bugs(void) -{ - /* - * kernel_fpu_begin/end() in check_fpu() relies on the patched - * alternative instructions. - */ - if (cpu_has_fpu) - check_fpu(); -} - /* * The earliest FPU detection code: */ -- cgit v1.2.1 From 71eb3c6d155c54fb74d2f95166162296a8f9af12 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 5 May 2015 10:54:04 +0200 Subject: x86/fpu: Make check_fpu() init ordering independent check_fpu() currently relies on being called early in the init sequence, when CR0::TS has not been set up yet. Save/restore CR0::TS across this function, to make it invariant to init ordering. This way we'll be able to move the generic FPU setup routines earlier in the init sequence. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/bugs.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index 400a3d713fb2..449b5f3f4925 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -23,8 +23,13 @@ static double __initdata y = 3145727.0; */ static void __init check_fpu(void) { + u32 cr0_saved; s32 fdiv_bug; + /* We might have CR0::TS set already, clear it: */ + cr0_saved = read_cr0(); + write_cr0(cr0_saved & ~X86_CR0_TS); + kernel_fpu_begin(); /* @@ -47,6 +52,8 @@ static void __init check_fpu(void) kernel_fpu_end(); + write_cr0(cr0_saved); + if (fdiv_bug) { set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); pr_warn("Hmm, FPU with FDIV bug\n"); -- cgit v1.2.1 From dd863880acd24a23d71576e402f999375d0b4b80 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 15:07:18 +0200 Subject: x86/fpu: Move fpu__init_system_early_generic() out of fpu__detect() Move the fpu__init_system_early_generic() call into fpu__init_system(), which hosts all the system init calls. Expose fpu__init_system() to other modules - this will be our main and only system init function. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 1 + arch/x86/kernel/fpu/init.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 4617eeb57004..5a1fa5bc2c27 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -41,6 +41,7 @@ extern void eager_fpu_init(void); extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system(struct cpuinfo_x86 *c); DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 63cd1703d25c..1155a98d8c1e 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -5,7 +5,10 @@ #include /* - * The earliest FPU detection code: + * The earliest FPU detection code. + * + * Set the X86_FEATURE_FPU CPU-capability bit based on + * trying to execute an actual sequence of FPU instructions: */ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) { @@ -200,8 +203,10 @@ static void fpu__init_system_ctx_switch(void) * Called on the boot CPU once per system bootup, to set up the initial FPU state that * is later cloned into all processes. */ -void fpu__init_system(void) +void fpu__init_system(struct cpuinfo_x86 *c) { + fpu__init_system_early_generic(c); + /* The FPU has to be operational for some of the later FPU init activities: */ fpu__init_cpu(); @@ -227,13 +232,8 @@ static int __init no_387(char *s) __setup("no387", no_387); -/* - * Set the X86_FEATURE_FPU CPU-capability bit based on - * trying to execute an actual sequence of FPU instructions: - */ void fpu__detect(struct cpuinfo_x86 *c) { - fpu__init_system_early_generic(c); - fpu__init_system(); + fpu__init_system(c); /* The final cr0 value is set later, in fpu_init() */ } -- cgit v1.2.1 From c66e3f28237199629358e9e5a76973c400a54041 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 15:12:44 +0200 Subject: x86/fpu: Remove the extra fpu__detect() layer Now that fpu__detect() has become an empty layer around fpu__init_system(), eliminate it and make fpu__init_system() the main system initialization routine. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/fpu/init.c | 6 ------ 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0f4add462697..b9e487499ae2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -167,7 +167,6 @@ extern const struct seq_operations cpuinfo_op; #define cache_line_size() (boot_cpu_data.x86_cache_alignment) extern void cpu_detect(struct cpuinfo_x86 *c); -extern void fpu__detect(struct cpuinfo_x86 *c); extern void early_cpu_init(void); extern void identify_boot_cpu(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d28f8ebc506d..d15610b0a4cf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -758,7 +758,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); - fpu__detect(c); + fpu__init_system(c); if (this_cpu->c_early_init) this_cpu->c_early_init(c); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 1155a98d8c1e..77b5d403de22 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -231,9 +231,3 @@ static int __init no_387(char *s) } __setup("no387", no_387); - -void fpu__detect(struct cpuinfo_x86 *c) -{ - fpu__init_system(c); - /* The final cr0 value is set later, in fpu_init() */ -} -- cgit v1.2.1 From 7638b74b56640d9c266b5b3622109128e30bdc1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 15:23:37 +0200 Subject: x86/fpu: Rename fpstate_xstate_init_size() to fpu__init_system_xstate_size_legacy() To bring it in line with the other init_system*() methods. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 77b5d403de22..a7ce5bcbcbab 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -83,7 +83,7 @@ static void fpu__init_system_generic(void) fpu__init_system_mxcsr(); } -static void fpstate_xstate_init_size(void) +static void fpu__init_system_xstate_size_legacy(void) { static bool on_boot_cpu = 1; @@ -218,7 +218,7 @@ void fpu__init_system(struct cpuinfo_x86 *c) clts(); fpu__init_system_generic(); - fpstate_xstate_init_size(); + fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); fpu__init_system_ctx_switch(); -- cgit v1.2.1 From 41e78410d89cc0e60d90e6a62a060d5d4084accb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 15:32:40 +0200 Subject: x86/fpu: Reorder init methods Reorder init methods in order of their relationship and usage, to form coherent blocks throughout the whole file. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 96 +++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index a7ce5bcbcbab..dbff1335229c 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -4,6 +4,46 @@ #include #include +static void fpu__init_cpu_ctx_switch(void) +{ + if (!cpu_has_eager_fpu) + stts(); + else + clts(); +} + +/* + * Initialize the registers found in all CPUs, CR0 and CR4: + */ +static void fpu__init_cpu_generic(void) +{ + unsigned long cr0; + unsigned long cr4_mask = 0; + + if (cpu_has_fxsr) + cr4_mask |= X86_CR4_OSFXSR; + if (cpu_has_xmm) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + cr4_set_bits(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!cpu_has_fpu) + cr0 |= X86_CR0_EM; + write_cr0(cr0); +} + +/* + * Enable all supported FPU features. Called when a CPU is brought online. + */ +void fpu__init_cpu(void) +{ + fpu__init_cpu_generic(); + fpu__init_cpu_xstate(); + fpu__init_cpu_ctx_switch(); +} + /* * The earliest FPU detection code. * @@ -44,9 +84,6 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) */ unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; -unsigned int xstate_size; -EXPORT_SYMBOL_GPL(xstate_size); - static void fpu__init_system_mxcsr(void) { unsigned int mask = 0; @@ -83,6 +120,15 @@ static void fpu__init_system_generic(void) fpu__init_system_mxcsr(); } +unsigned int xstate_size; +EXPORT_SYMBOL_GPL(xstate_size); + +/* + * Set up the xstate_size based on the legacy FPU context size. + * + * We set this up first, and later it will be overwritten by + * fpu__init_system_xstate() if the CPU knows about xstates. + */ static void fpu__init_system_xstate_size_legacy(void) { static bool on_boot_cpu = 1; @@ -112,50 +158,6 @@ static void fpu__init_system_xstate_size_legacy(void) } } -/* - * Initialize the TS bit in CR0 according to the style of context-switches - * we are using: - */ -static void fpu__init_cpu_ctx_switch(void) -{ - if (!cpu_has_eager_fpu) - stts(); - else - clts(); -} - -/* - * Initialize the registers found in all CPUs, CR0 and CR4: - */ -static void fpu__init_cpu_generic(void) -{ - unsigned long cr0; - unsigned long cr4_mask = 0; - - if (cpu_has_fxsr) - cr4_mask |= X86_CR4_OSFXSR; - if (cpu_has_xmm) - cr4_mask |= X86_CR4_OSXMMEXCPT; - if (cr4_mask) - cr4_set_bits(cr4_mask); - - cr0 = read_cr0(); - cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ - if (!cpu_has_fpu) - cr0 |= X86_CR0_EM; - write_cr0(cr0); -} - -/* - * Enable all supported FPU features. Called when a CPU is brought online. - */ -void fpu__init_cpu(void) -{ - fpu__init_cpu_generic(); - fpu__init_cpu_xstate(); - fpu__init_cpu_ctx_switch(); -} - static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; static int __init eager_fpu_setup(char *s) -- cgit v1.2.1 From ae02679c566fb1c2f76d3c6dffef977a9d69474a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 15:36:46 +0200 Subject: x86/fpu: Add more comments to the FPU init code Extend the comments of the FPU init code, and fix old ones. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 70 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index dbff1335229c..7ae5a62918c7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -1,9 +1,13 @@ /* - * x86 FPU boot time init code + * x86 FPU boot time init code: */ #include #include +/* + * Initialize the TS bit in CR0 according to the style of context-switches + * we are using: + */ static void fpu__init_cpu_ctx_switch(void) { if (!cpu_has_eager_fpu) @@ -35,7 +39,7 @@ static void fpu__init_cpu_generic(void) } /* - * Enable all supported FPU features. Called when a CPU is brought online. + * Enable all supported FPU features. Called when a CPU is brought online: */ void fpu__init_cpu(void) { @@ -71,8 +75,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) #ifndef CONFIG_MATH_EMULATION if (!cpu_has_fpu) { - pr_emerg("No FPU found and no math emulation present\n"); - pr_emerg("Giving up\n"); + pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); for (;;) asm volatile("hlt"); } @@ -120,6 +123,12 @@ static void fpu__init_system_generic(void) fpu__init_system_mxcsr(); } +/* + * Size of the FPU context state. All tasks in the system use the + * same context size, regardless of what portion they use. + * This is inherent to the XSAVE architecture which puts all state + * components into a single, continuous memory block: + */ unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); @@ -158,6 +167,37 @@ static void fpu__init_system_xstate_size_legacy(void) } } +/* + * FPU context switching strategies: + * + * Against popular belief, we don't do lazy FPU saves, due to the + * task migration complications it brings on SMP - we only do + * lazy FPU restores. + * + * 'lazy' is the traditional strategy, which is based on setting + * CR0::TS to 1 during context-switch (instead of doing a full + * restore of the FPU state), which causes the first FPU instruction + * after the context switch (whenever it is executed) to fault - at + * which point we lazily restore the FPU state into FPU registers. + * + * Tasks are of course under no obligation to execute FPU instructions, + * so it can easily happen that another context-switch occurs without + * a single FPU instruction being executed. If we eventually switch + * back to the original task (that still owns the FPU) then we have + * not only saved the restores along the way, but we also have the + * FPU ready to be used for the original task. + * + * 'eager' switching is used on modern CPUs, there we switch the FPU + * state during every context switch, regardless of whether the task + * has used FPU instructions in that time slice or not. This is done + * because modern FPU context saving instructions are able to optimize + * state saving and restoration in hardware: they can detect both + * unused and untouched FPU state and optimize accordingly. + * + * [ Note that even in 'lazy' mode we might optimize context switches + * to use 'eager' restores, if we detect that a task is using the FPU + * frequently. See the fpu->counter logic in fpu/internal.h for that. ] + */ static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; static int __init eager_fpu_setup(char *s) @@ -173,8 +213,7 @@ static int __init eager_fpu_setup(char *s) __setup("eagerfpu=", eager_fpu_setup); /* - * setup_init_fpu_buf() is __init and it is OK to call it here because - * init_xstate_ctx will be unset only once during boot. + * Pick the FPU context switching strategy: */ static void fpu__init_system_ctx_switch(void) { @@ -202,20 +241,24 @@ static void fpu__init_system_ctx_switch(void) } /* - * Called on the boot CPU once per system bootup, to set up the initial FPU state that - * is later cloned into all processes. + * Called on the boot CPU once per system bootup, to set up the initial + * FPU state that is later cloned into all processes: */ void fpu__init_system(struct cpuinfo_x86 *c) { fpu__init_system_early_generic(c); - /* The FPU has to be operational for some of the later FPU init activities: */ + /* + * The FPU has to be operational for some of the + * later FPU init activities: + */ fpu__init_cpu(); /* - * But don't leave CR0::TS set yet, as some of the FPU setup methods depend - * on being able to execute FPU instructions that will fault on a set TS, - * such as the FXSAVE in fpu__init_system_mxcsr(). + * But don't leave CR0::TS set yet, as some of the FPU setup + * methods depend on being able to execute FPU instructions + * that will fault on a set TS, such as the FXSAVE in + * fpu__init_system_mxcsr(). */ clts(); @@ -226,6 +269,9 @@ void fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_ctx_switch(); } +/* + * Boot parameter to turn off FPU support and fall back to math-emu: + */ static int __init no_387(char *s) { setup_clear_cpu_cap(X86_FEATURE_FPU); -- cgit v1.2.1 From e229537543fba3de4026cb6e12b75946d9f3430f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 16:43:43 +0200 Subject: x86/fpu: Move fpu__save() to fpu/internals.h It's an internal method, not a driver API, so move it from fpu/api.h to fpu/internal.h. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/api.h | 2 -- arch/x86/include/asm/fpu/internal.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 4ca745c0d92e..eeac3766d8e5 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -99,6 +99,4 @@ static inline int user_has_fpu(void) return current->thread.fpu.fpregs_active; } -extern void fpu__save(struct fpu *fpu); - #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 5a1fa5bc2c27..0c8c812d23b4 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -290,6 +290,8 @@ static inline int fpu_save_init(struct fpu *fpu) return 1; } +extern void fpu__save(struct fpu *fpu); + static inline int fpu_restore_checking(struct fpu *fpu) { if (use_xsave()) -- cgit v1.2.1 From d63e79b114c0208bc2b7712c879568e180909d60 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 12:07:18 +0200 Subject: x86/fpu: Uninline kernel_fpu_begin()/end() Both inline functions call an inline function unconditionally, so we already pay the function call based clobbering cost. Uninline them. This saves quite a bit of code in various performance sensitive code paths: text data bss dec hex filename 13321334 2569888 1634304 17525526 10b6b16 vmlinux.before 13320246 2569888 1634304 17524438 10b66d6 vmlinux.after Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/api.h | 15 ++------------- arch/x86/kernel/fpu/core.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index eeac3766d8e5..d4ab9e3af234 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -39,19 +39,8 @@ extern bool irq_fpu_usable(void); */ extern void __kernel_fpu_begin(void); extern void __kernel_fpu_end(void); - -static inline void kernel_fpu_begin(void) -{ - preempt_disable(); - WARN_ON_ONCE(!irq_fpu_usable()); - __kernel_fpu_begin(); -} - -static inline void kernel_fpu_end(void) -{ - __kernel_fpu_end(); - preempt_enable(); -} +extern void kernel_fpu_begin(void); +extern void kernel_fpu_end(void); /* * Some instructions like VIA's padlock instructions generate a spurious diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3d2ec4bd7f8c..8323a2a5241c 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -125,6 +125,21 @@ void __kernel_fpu_end(void) } EXPORT_SYMBOL(__kernel_fpu_end); +void kernel_fpu_begin(void) +{ + preempt_disable(); + WARN_ON_ONCE(!irq_fpu_usable()); + __kernel_fpu_begin(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ + __kernel_fpu_end(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_end); + static void __save_fpu(struct fpu *fpu) { if (use_xsave()) { -- cgit v1.2.1 From 952f07ecbd4d9bac77c003ba136f8ee8ce631591 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 16:56:05 +0200 Subject: x86/fpu: Move various internal function prototypes to fpu/internal.h There are a number of FPU internal function prototypes and an inline function in fpu/api.h, mostly placed so historically as the code grew over the years. Move them over into fpu/internal.h where they belong. (Add sched.h include to stackprotector.h which incorrectly relied on getting it from fpu/api.h.) fpu/api.h is now a pure file that only contains FPU APIs intended for driver use. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/api.h | 31 +------------------------------ arch/x86/include/asm/fpu/internal.h | 25 +++++++++++++++++++++++++ arch/x86/include/asm/stackprotector.h | 2 ++ arch/x86/kernel/cpu/bugs.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- arch/x86/power/cpu.c | 1 + 7 files changed, 32 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index d4ab9e3af234..0c713455fc63 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -10,23 +10,8 @@ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H -#include #include -struct pt_regs; -struct user_i387_struct; - -extern int fpstate_alloc_init(struct fpu *fpu); -extern void fpstate_init(struct fpu *fpu); -extern void fpu__clear(struct task_struct *tsk); - -extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); -extern void fpu__restore(void); -extern void fpu__init_check_bugs(void); -extern void fpu__resume_cpu(void); - -extern bool irq_fpu_usable(void); - /* * Careful: __kernel_fpu_begin/end() must be called with preempt disabled * and they don't touch the preempt state on their own. @@ -41,6 +26,7 @@ extern void __kernel_fpu_begin(void); extern void __kernel_fpu_end(void); extern void kernel_fpu_begin(void); extern void kernel_fpu_end(void); +extern bool irq_fpu_usable(void); /* * Some instructions like VIA's padlock instructions generate a spurious @@ -73,19 +59,4 @@ static inline void irq_ts_restore(int TS_state) stts(); } -/* - * The question "does this thread have fpu access?" - * is slightly racy, since preemption could come in - * and revoke it immediately after the test. - * - * However, even in that very unlikely scenario, - * we can just assume we have FPU access - typically - * to save the FP state - we'll just take a #NM - * fault and get the FPU access back. - */ -static inline int user_has_fpu(void) -{ - return current->thread.fpu.fpregs_active; -} - #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0c8c812d23b4..89c6ec80c1ac 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -43,6 +44,15 @@ extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); +extern int fpstate_alloc_init(struct fpu *fpu); +extern void fpstate_init(struct fpu *fpu); +extern void fpu__clear(struct task_struct *tsk); + +extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); +extern void fpu__restore(void); +extern void fpu__init_check_bugs(void); +extern void fpu__resume_cpu(void); + DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); extern void convert_from_fxsr(struct user_i387_ia32_struct *env, @@ -334,6 +344,21 @@ static inline void __fpregs_activate(struct fpu *fpu) this_cpu_write(fpu_fpregs_owner_ctx, fpu); } +/* + * The question "does this thread have fpu access?" + * is slightly racy, since preemption could come in + * and revoke it immediately after the test. + * + * However, even in that very unlikely scenario, + * we can just assume we have FPU access - typically + * to save the FP state - we'll just take a #NM + * fault and get the FPU access back. + */ +static inline int user_has_fpu(void) +{ + return current->thread.fpu.fpregs_active; +} + /* * Encapsulate the CR0.TS handling together with the * software flag. diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 6a998598f172..c2e00bb2a136 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -39,7 +39,9 @@ #include #include #include + #include +#include /* * 24 byte read-only segment initializer for stack canary. Linker diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 29dd74318ec6..bd17db15a2c1 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5cb738a18ca3..f93ae71416e4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 3bb4c6a24ea5..cf843855e4f6 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include "fpu_system.h" #include "fpu_emu.h" diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index ad0ce6b70fac..0d7dd1f5ac36 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include -- cgit v1.2.1 From 910665882fc00dc5bab0f846fe707174ff289615 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Apr 2015 16:57:55 +0200 Subject: x86/fpu: Uninline the irq_ts_save()/restore() functions Especially the irq_ts_save() function is pretty bloaty, generating over a dozen instructions, so uninline them. Even though the API is used rarely, the space savings are measurable: text data bss dec hex filename 13331995 2572920 1634304 17539219 10ba093 vmlinux.before 13331739 2572920 1634304 17538963 10b9f93 vmlinux.after ( This also allows the removal of an include file inclusion from fpu/api.h, speeding up the kernel build slightly. ) Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/api.h | 27 ++------------------------- arch/x86/kernel/fpu/core.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 0c713455fc63..62035cc1d961 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -10,8 +10,6 @@ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H -#include - /* * Careful: __kernel_fpu_begin/end() must be called with preempt disabled * and they don't touch the preempt state on their own. @@ -35,28 +33,7 @@ extern bool irq_fpu_usable(void); * in interrupt context interacting wrongly with other user/kernel fpu usage, we * should use them only in the context of irq_ts_save/restore() */ -static inline int irq_ts_save(void) -{ - /* - * If in process context and not atomic, we can take a spurious DNA fault. - * Otherwise, doing clts() in process context requires disabling preemption - * or some heavy lifting like kernel_fpu_begin() - */ - if (!in_atomic()) - return 0; - - if (read_cr0() & X86_CR0_TS) { - clts(); - return 1; - } - - return 0; -} - -static inline void irq_ts_restore(int TS_state) -{ - if (TS_state) - stts(); -} +extern int irq_ts_save(void); +extern void irq_ts_restore(int TS_state); #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8323a2a5241c..b20d4ea8e132 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -6,6 +6,7 @@ * Gareth Hughes , May 2000 */ #include +#include /* * Track whether the kernel is using the FPU state @@ -140,6 +141,35 @@ void kernel_fpu_end(void) } EXPORT_SYMBOL_GPL(kernel_fpu_end); +/* + * CR0::TS save/restore functions: + */ +int irq_ts_save(void) +{ + /* + * If in process context and not atomic, we can take a spurious DNA fault. + * Otherwise, doing clts() in process context requires disabling preemption + * or some heavy lifting like kernel_fpu_begin() + */ + if (!in_atomic()) + return 0; + + if (read_cr0() & X86_CR0_TS) { + clts(); + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(irq_ts_save); + +void irq_ts_restore(int TS_state) +{ + if (TS_state) + stts(); +} +EXPORT_SYMBOL_GPL(irq_ts_restore); + static void __save_fpu(struct fpu *fpu) { if (use_xsave()) { -- cgit v1.2.1 From 4f83634710a1a7024b8acaa3b589dc5d8ca03ab0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 02:53:16 +0200 Subject: x86/fpu: Rename fpu_save_init() to copy_fpregs_to_fpstate() So fpu_save_init() is a historic name that got its name when the only way the FPU state was FNSAVE, which cleared (well, destroyed) the FPU state after saving it. Nowadays the name is misleading, because ever since the introduction of FXSAVE (and more modern FPU saving instructions) the 'we need to reload the FPU state' part is only true if there's a pending FPU exception [*], which is almost never the case. So rename it to copy_fpregs_to_fpstate() to make it clear what's happening. Also add a few comments about why we cannot keep registers in certain cases. Also clean up the control flow a bit, to make it more apparent when we are dropping/keeping FP registers, and to optimize the common case (of keeping fpregs) some more. [*] Probably not true anymore, modern instructions always leave the FPU state intact, even if exceptions are pending: because pending FP exceptions are posted on the next FP instruction, not asynchronously. They were truly asynchronous back in the IRQ13 case, and we had to synchronize with them, but that code is not working anymore: we don't have IRQ13 mapped in the IDT anymore. But a cleanup patch is obviously not the place to change subtle behavior. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 34 ++++++++++++++++++++++++---------- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/x86.c | 2 +- arch/x86/mm/mpx.c | 2 +- 5 files changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 89c6ec80c1ac..11055f51e67a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -265,9 +265,15 @@ static inline void fpu_fxsave(struct fpu *fpu) /* * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact. + * 'true' if the FPU state is still intact and we can + * keep registers active. + * + * The legacy FNSAVE instruction cleared all FPU state + * unconditionally, so registers are essentially destroyed. + * Modern FPU state can be kept in registers, if there are + * no pending FP exceptions. (Note the FIXME below.) */ -static inline int fpu_save_init(struct fpu *fpu) +static inline int copy_fpregs_to_fpstate(struct fpu *fpu) { if (use_xsave()) { xsave_state(&fpu->state->xsave); @@ -276,13 +282,16 @@ static inline int fpu_save_init(struct fpu *fpu) * xsave header may indicate the init state of the FP. */ if (!(fpu->state->xsave.header.xfeatures & XSTATE_FP)) - return 1; - } else if (use_fxsr()) { - fpu_fxsave(fpu); + goto keep_fpregs; } else { - asm volatile("fnsave %[fx]; fwait" - : [fx] "=m" (fpu->state->fsave)); - return 0; + if (use_fxsr()) { + fpu_fxsave(fpu); + } else { + /* FNSAVE always clears FPU registers: */ + asm volatile("fnsave %[fx]; fwait" + : [fx] "=m" (fpu->state->fsave)); + goto drop_fpregs; + } } /* @@ -295,9 +304,14 @@ static inline int fpu_save_init(struct fpu *fpu) */ if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { asm volatile("fnclex"); - return 0; + goto drop_fpregs; } + +keep_fpregs: return 1; + +drop_fpregs: + return 0; } extern void fpu__save(struct fpu *fpu); @@ -448,7 +462,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) (use_eager_fpu() || new_fpu->counter > 5); if (old_fpu->fpregs_active) { - if (!fpu_save_init(old_fpu)) + if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else old_fpu->last_cpu = cpu; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b20d4ea8e132..ca88608a62a5 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -102,7 +102,7 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); if (fpu->fpregs_active) { - fpu_save_init(fpu); + copy_fpregs_to_fpstate(fpu); } else { this_cpu_write(fpu_fpregs_owner_ctx, NULL); if (!use_eager_fpu()) @@ -196,7 +196,7 @@ void fpu__save(struct fpu *fpu) if (use_eager_fpu()) { __save_fpu(fpu); } else { - fpu_save_init(fpu); + copy_fpregs_to_fpstate(fpu); fpregs_deactivate(fpu); } } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a65586edbb57..f028f1da3480 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -395,7 +395,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) * It is not directly accessible, though, so we need to * do an xsave and then pull it out of the xsave buffer. */ - fpu_save_init(&tsk->thread.fpu); + copy_fpregs_to_fpstate(&tsk->thread.fpu); xsave_buf = &(tsk->thread.fpu.state->xsave); bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); if (!bndcsr) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b58b9397098..d90bf4afa2b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7058,7 +7058,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 0; - fpu_save_init(&vcpu->arch.guest_fpu); + copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 5563be313fd6..3287215be60a 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -357,7 +357,7 @@ static __user void *task_get_bounds_dir(struct task_struct *tsk) * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. */ - fpu_save_init(&tsk->thread.fpu); + copy_fpregs_to_fpstate(&tsk->thread.fpu); bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR); if (!bndcsr) return MPX_INVALID_BOUNDS_DIR; -- cgit v1.2.1 From 1bc6b056d8f90f0a710ff1201964c1fffc9ddd3c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 03:32:18 +0200 Subject: x86/fpu: Optimize copy_fpregs_to_fpstate() by removing the FNCLEX synchronization with FP exceptions So we have the following ancient code in copy_fpregs_to_fpstate(): if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { asm volatile("fnclex"); goto drop_fpregs; } which clears pending FPU exceptions and then drops registers, which causes the next FP instruction of the saved context to re-load the saved FPU state, with all pending exceptions marked properly, and will re-start the exception handling mechanism in the hardware. Since FPU exceptions are always issued on instruction boundaries, in particular on the next FP instruction following the exception generating instruction, there's no fear of getting an FP exception asynchronously. They were truly asynchronous back in the IRQ13 days, when the FPU was a weird and expensive co-processor that did its own processing, and we had to synchronize with them, but that code is not working anymore: we don't have IRQ13 mapped in the IDT anymore. With the introduction of optimized XSAVE support there's a new complication: if the xstate features bit indicates that a particular state component is unused (in 'init state'), then the hardware does not guarantee that the XSAVE (et al) instruction keeps the underlying FPU state image in memory valid and current. In practice this means that the hardware won't write it, and the exceptions flag in the state might be an older version, with it still being set. This meant that we had to check the xfeatures flag as well, adding another memory load and branch to a critical hot path of the scheduler. So optimize all this by removing both the old quirk and the new check, and straight-line optimizing the most common cases with likely() hints. Quite a bit of code gets removed this way: arch/x86/kernel/process_64.o: text data bss dec filename 5484 8 0 5492 process_64.o.before 5416 8 0 5424 process_64.o.after Now there's also a chance that some weird behavior or erratum was masked by our IRQ13 handling quirk (or that I misunderstood the nature of the quirk), and that this change triggers some badness. There's no real good way to protect against that possibility other than keeping this change well isolated, well commented and well bisectable. If you bisect a weird (or not so weird) breakage to this commit then please let us know! Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 40 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 11055f51e67a..10663b02ee22 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -271,46 +271,26 @@ static inline void fpu_fxsave(struct fpu *fpu) * The legacy FNSAVE instruction cleared all FPU state * unconditionally, so registers are essentially destroyed. * Modern FPU state can be kept in registers, if there are - * no pending FP exceptions. (Note the FIXME below.) + * no pending FP exceptions. */ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) { - if (use_xsave()) { + if (likely(use_xsave())) { xsave_state(&fpu->state->xsave); + return 1; + } - /* - * xsave header may indicate the init state of the FP. - */ - if (!(fpu->state->xsave.header.xfeatures & XSTATE_FP)) - goto keep_fpregs; - } else { - if (use_fxsr()) { - fpu_fxsave(fpu); - } else { - /* FNSAVE always clears FPU registers: */ - asm volatile("fnsave %[fx]; fwait" - : [fx] "=m" (fpu->state->fsave)); - goto drop_fpregs; - } + if (likely(use_fxsr())) { + fpu_fxsave(fpu); + return 1; } /* - * If exceptions are pending, we need to clear them so - * that we don't randomly get exceptions later. - * - * FIXME! Is this perhaps only true for the old-style - * irq13 case? Maybe we could leave the x87 state - * intact otherwise? + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to mark them inactive: */ - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { - asm volatile("fnclex"); - goto drop_fpregs; - } - -keep_fpregs: - return 1; + asm volatile("fnsave %[fx]; fwait" : [fx] "=m" (fpu->state->fsave)); -drop_fpregs: return 0; } -- cgit v1.2.1 From 7366ed771f6ed95e4c4525c335722888a83b4b6c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 04:19:39 +0200 Subject: x86/fpu: Simplify FPU handling by embedding the fpstate in task_struct (again) So 6 years ago we made the FPU fpstate dynamically allocated: aa283f49276e ("x86, fpu: lazy allocation of FPU area - v5") 61c4628b5386 ("x86, fpu: split FPU state from task struct - v5") In hindsight this was a mistake: - it complicated context allocation failure handling, such as: /* kthread execs. TODO: cleanup this horror. */ if (WARN_ON(fpstate_alloc_init(fpu))) force_sig(SIGKILL, tsk); - it caused us to enable irqs in fpu__restore(): local_irq_enable(); /* * does a slab alloc which can sleep */ if (fpstate_alloc_init(fpu)) { /* * ran out of memory! */ do_group_exit(SIGKILL); return; } local_irq_disable(); - it (slightly) slowed down task creation/destruction by adding slab allocation/free pattens. - it made access to context contents (slightly) slower by adding one more pointer dereference. The motivation for the dynamic allocation was two-fold: - reduce memory consumption by non-FPU tasks - allocate and handle only the necessary amount of context for various XSAVE processors that have varying hardware frame sizes. These days, with glibc using SSE memcpy by default and GCC optimizing for SSE/AVX by default, the scope of FPU using apps on an x86 system is much larger than it was 6 years ago. For example on a freshly installed Fedora 21 desktop system, with a recent kernel, all non-kthread tasks have used the FPU shortly after bootup. Also, even modern embedded x86 CPUs try to support the latest vector instruction set - so they'll too often use the larger xstate frame sizes. So remove the dynamic allocation complication by embedding the FPU fpstate in task_struct again. This should make the FPU a lot more accessible to all sorts of atomic contexts. We could still optimize for the xstate frame size in the future, by moving the state structure to the last element of task_struct, and allocating only a part of that. This change is kept minimal by still keeping the ctx_alloc()/free() routines (that now do nothing substantial) - we'll remove them in the following patches. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 34 ++++++++++++------------ arch/x86/include/asm/fpu/types.h | 2 +- arch/x86/kernel/fpu/core.c | 52 ++++++++++++++----------------------- arch/x86/kernel/fpu/xsave.c | 12 ++++----- arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/x86.c | 14 +++++----- arch/x86/math-emu/fpu_aux.c | 2 +- arch/x86/math-emu/fpu_entry.c | 4 +-- arch/x86/math-emu/fpu_system.h | 2 +- arch/x86/mm/mpx.c | 2 +- 10 files changed, 57 insertions(+), 69 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 10663b02ee22..4ce830fb3f31 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -232,9 +232,9 @@ static inline int frstor_user(struct i387_fsave_struct __user *fx) static inline void fpu_fxsave(struct fpu *fpu) { if (config_enabled(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); + asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); else if (config_enabled(CONFIG_AS_FXSAVEQ)) - asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave)); + asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); else { /* Using "rex64; fxsave %0" is broken because, if the memory * operand uses any extended registers for addressing, a second @@ -251,15 +251,15 @@ static inline void fpu_fxsave(struct fpu *fpu) * an extended register is needed for addressing (fix submitted * to mainline 2005-11-21). * - * asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave)); + * asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave)); * * This, however, we can work around by forcing the compiler to * select an addressing mode that doesn't require extended * registers. */ asm volatile( "rex64/fxsave (%[fx])" - : "=m" (fpu->state->fxsave) - : [fx] "R" (&fpu->state->fxsave)); + : "=m" (fpu->state.fxsave) + : [fx] "R" (&fpu->state.fxsave)); } } @@ -276,7 +276,7 @@ static inline void fpu_fxsave(struct fpu *fpu) static inline int copy_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { - xsave_state(&fpu->state->xsave); + xsave_state(&fpu->state.xsave); return 1; } @@ -289,7 +289,7 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to mark them inactive: */ - asm volatile("fnsave %[fx]; fwait" : [fx] "=m" (fpu->state->fsave)); + asm volatile("fnsave %[fx]; fwait" : [fx] "=m" (fpu->state.fsave)); return 0; } @@ -299,11 +299,11 @@ extern void fpu__save(struct fpu *fpu); static inline int fpu_restore_checking(struct fpu *fpu) { if (use_xsave()) - return fpu_xrstor_checking(&fpu->state->xsave); + return fpu_xrstor_checking(&fpu->state.xsave); else if (use_fxsr()) - return fxrstor_checking(&fpu->state->fxsave); + return fxrstor_checking(&fpu->state.fxsave); else - return frstor_checking(&fpu->state->fsave); + return frstor_checking(&fpu->state.fsave); } static inline int restore_fpu_checking(struct fpu *fpu) @@ -454,7 +454,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) if (fpu.preload) { new_fpu->counter++; __fpregs_activate(new_fpu); - prefetch(new_fpu->state); + prefetch(&new_fpu->state); } else if (!use_eager_fpu()) stts(); } else { @@ -465,7 +465,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else - prefetch(new_fpu->state); + prefetch(&new_fpu->state); fpregs_activate(new_fpu); } } @@ -534,25 +534,25 @@ static inline void user_fpu_begin(void) static inline unsigned short get_fpu_cwd(struct task_struct *tsk) { if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.cwd; + return tsk->thread.fpu.state.fxsave.cwd; } else { - return (unsigned short)tsk->thread.fpu.state->fsave.cwd; + return (unsigned short)tsk->thread.fpu.state.fsave.cwd; } } static inline unsigned short get_fpu_swd(struct task_struct *tsk) { if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.swd; + return tsk->thread.fpu.state.fxsave.swd; } else { - return (unsigned short)tsk->thread.fpu.state->fsave.swd; + return (unsigned short)tsk->thread.fpu.state.fsave.swd; } } static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) { if (cpu_has_xmm) { - return tsk->thread.fpu.state->fxsave.mxcsr; + return tsk->thread.fpu.state.fxsave.mxcsr; } else { return MXCSR_DEFAULT; } diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 231a8f53b2f8..3a15ac6032eb 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -143,7 +143,7 @@ struct fpu { unsigned int last_cpu; unsigned int fpregs_active; - union thread_xstate *state; + union thread_xstate state; /* * This counter contains the number of consecutive context switches * during which the FPU stays used. If this is over a threshold, the diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ca88608a62a5..1697a9a34ff0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -174,9 +174,9 @@ static void __save_fpu(struct fpu *fpu) { if (use_xsave()) { if (unlikely(system_state == SYSTEM_BOOTING)) - xsave_state_booting(&fpu->state->xsave); + xsave_state_booting(&fpu->state.xsave); else - xsave_state(&fpu->state->xsave); + xsave_state(&fpu->state.xsave); } else { fpu_fxsave(fpu); } @@ -207,16 +207,16 @@ EXPORT_SYMBOL_GPL(fpu__save); void fpstate_init(struct fpu *fpu) { if (!cpu_has_fpu) { - finit_soft_fpu(&fpu->state->soft); + finit_soft_fpu(&fpu->state.soft); return; } - memset(fpu->state, 0, xstate_size); + memset(&fpu->state, 0, xstate_size); if (cpu_has_fxsr) { - fx_finit(&fpu->state->fxsave); + fx_finit(&fpu->state.fxsave); } else { - struct i387_fsave_struct *fp = &fpu->state->fsave; + struct i387_fsave_struct *fp = &fpu->state.fsave; fp->cwd = 0xffff037fu; fp->swd = 0xffff0000u; fp->twd = 0xffffffffu; @@ -241,15 +241,8 @@ void fpstate_cache_init(void) int fpstate_alloc(struct fpu *fpu) { - if (fpu->state) - return 0; - - fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); - if (!fpu->state) - return -ENOMEM; - /* The CPU requires the FPU state to be aligned to 16 byte boundaries: */ - WARN_ON((unsigned long)fpu->state & 15); + WARN_ON((unsigned long)&fpu->state & 15); return 0; } @@ -257,10 +250,6 @@ EXPORT_SYMBOL_GPL(fpstate_alloc); void fpstate_free(struct fpu *fpu) { - if (fpu->state) { - kmem_cache_free(task_xstate_cachep, fpu->state); - fpu->state = NULL; - } } EXPORT_SYMBOL_GPL(fpstate_free); @@ -277,11 +266,11 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) WARN_ON(src_fpu != ¤t->thread.fpu); if (use_eager_fpu()) { - memset(&dst_fpu->state->xsave, 0, xstate_size); + memset(&dst_fpu->state.xsave, 0, xstate_size); __save_fpu(dst_fpu); } else { fpu__save(src_fpu); - memcpy(dst_fpu->state, src_fpu->state, xstate_size); + memcpy(&dst_fpu->state, &src_fpu->state, xstate_size); } } @@ -289,7 +278,6 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; - dst_fpu->state = NULL; dst_fpu->last_cpu = -1; if (src_fpu->fpstate_active) { @@ -483,7 +471,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, sanitize_i387_state(target); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state->fxsave, 0, -1); + &fpu->state.fxsave, 0, -1); } int xfpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -503,19 +491,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, sanitize_i387_state(target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state->fxsave, 0, -1); + &fpu->state.fxsave, 0, -1); /* * mxcsr reserved bits must be masked to zero for security reasons. */ - fpu->state->fxsave.mxcsr &= mxcsr_feature_mask; + fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; /* * update the header bits in the xsave header, indicating the * presence of FP and SSE state. */ if (cpu_has_xsave) - fpu->state->xsave.header.xfeatures |= XSTATE_FPSSE; + fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE; return ret; } @@ -535,7 +523,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - xsave = &fpu->state->xsave; + xsave = &fpu->state.xsave; /* * Copy the 48bytes defined by the software first into the xstate @@ -566,7 +554,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - xsave = &fpu->state->xsave; + xsave = &fpu->state.xsave; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); /* @@ -657,7 +645,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; + struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -695,7 +683,7 @@ void convert_to_fxsr(struct task_struct *tsk, const struct user_i387_ia32_struct *env) { - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; + struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -736,7 +724,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state->fsave, 0, + &fpu->state.fsave, 0, -1); sanitize_i387_state(target); @@ -770,7 +758,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state->fsave, 0, + &fpu->state.fsave, 0, -1); if (pos > 0 || count < sizeof(env)) @@ -785,7 +773,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * presence of FP. */ if (cpu_has_xsave) - fpu->state->xsave.header.xfeatures |= XSTATE_FP; + fpu->state.xsave.header.xfeatures |= XSTATE_FP; return ret; } diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index a23236358fb0..c7d48eb0a194 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -44,14 +44,14 @@ static unsigned int xfeatures_nr; */ void __sanitize_i387_state(struct task_struct *tsk) { - struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; + struct i387_fxsave_struct *fx = &tsk->thread.fpu.state.fxsave; int feature_bit; u64 xfeatures; if (!fx) return; - xfeatures = tsk->thread.fpu.state->xsave.header.xfeatures; + xfeatures = tsk->thread.fpu.state.xsave.header.xfeatures; /* * None of the feature bits are in init state. So nothing else @@ -147,7 +147,7 @@ static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; struct user_i387_ia32_struct env; struct _fpstate_ia32 __user *fp = buf; @@ -245,7 +245,7 @@ static inline int save_user_xstate(struct xsave_struct __user *buf) */ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) { - struct xsave_struct *xsave = ¤t->thread.fpu.state->xsave; + struct xsave_struct *xsave = ¤t->thread.fpu.state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -288,7 +288,7 @@ sanitize_restored_xstate(struct task_struct *tsk, struct user_i387_ia32_struct *ia32_env, u64 xfeatures, int fx_only) { - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; struct xstate_header *header = &xsave->header; if (use_xsave()) { @@ -402,7 +402,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) */ drop_fpu(fpu); - if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) || + if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(fpu); err = -1; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f028f1da3480..48dfcd9ed351 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -396,7 +396,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) * do an xsave and then pull it out of the xsave buffer. */ copy_fpregs_to_fpstate(&tsk->thread.fpu); - xsave_buf = &(tsk->thread.fpu.state->xsave); + xsave_buf = &(tsk->thread.fpu.state.xsave); bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); if (!bndcsr) goto exit_trap; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d90bf4afa2b0..8bb0de5bf9c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3196,7 +3196,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { - struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; + struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state.xsave; u64 xstate_bv = xsave->header.xfeatures; u64 valid; @@ -3232,7 +3232,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) { - struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; + struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state.xsave; u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); u64 valid; @@ -3277,7 +3277,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, fill_xsave((u8 *) guest_xsave->region, vcpu); } else { memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state->fxsave, + &vcpu->arch.guest_fpu.state.fxsave, sizeof(struct i387_fxsave_struct)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = XSTATE_FPSSE; @@ -3302,7 +3302,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, } else { if (xstate_bv & ~XSTATE_FPSSE) return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state->fxsave, + memcpy(&vcpu->arch.guest_fpu.state.fxsave, guest_xsave->region, sizeof(struct i387_fxsave_struct)); } return 0; @@ -6973,7 +6973,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; + &vcpu->arch.guest_fpu.state.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -6990,7 +6990,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; + &vcpu->arch.guest_fpu.state.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -7014,7 +7014,7 @@ int fx_init(struct kvm_vcpu *vcpu) fpstate_init(&vcpu->arch.guest_fpu); if (cpu_has_xsaves) - vcpu->arch.guest_fpu.state->xsave.header.xcomp_bv = + vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index dc8adad10a2f..7562341ce299 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -52,7 +52,7 @@ void finit_soft_fpu(struct i387_soft_struct *soft) void finit(void) { - finit_soft_fpu(¤t->thread.fpu.state->soft); + finit_soft_fpu(¤t->thread.fpu.state.soft); } /* diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index cf843855e4f6..5e003704ebfa 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -683,7 +683,7 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct i387_soft_struct *s387 = &target->thread.fpu.state->soft; + struct i387_soft_struct *s387 = &target->thread.fpu.state.soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; @@ -735,7 +735,7 @@ int fpregs_soft_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - struct i387_soft_struct *s387 = &target->thread.fpu.state->soft; + struct i387_soft_struct *s387 = &target->thread.fpu.state.soft; const void *space = s387->st_space; int ret; int offset = (S387->ftop & 7) * 10, other = 80 - offset; diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 2c614410a5f3..9ccecb61a4fa 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -31,7 +31,7 @@ #define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ == (1 << 10)) -#define I387 (current->thread.fpu.state) +#define I387 (¤t->thread.fpu.state) #define FPU_info (I387->soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 3287215be60a..ea5b367b63a9 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -358,7 +358,7 @@ static __user void *task_get_bounds_dir(struct task_struct *tsk) * only accessible if we first do an xsave. */ copy_fpregs_to_fpstate(&tsk->thread.fpu); - bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR); + bndcsr = get_xsave_addr(&tsk->thread.fpu.state.xsave, XSTATE_BNDCSR); if (!bndcsr) return MPX_INVALID_BOUNDS_DIR; -- cgit v1.2.1 From c4d6ee6e2e52ec604cc1d76877791f8e8f5c79b5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 05:52:40 +0200 Subject: x86/fpu: Remove failure paths from fpstate-alloc low level functions Now that we always allocate the FPU context as part of task_struct there's no need for separate allocations - remove them and their primary failure handling code. ( Note that there's still secondary error codes that have become superfluous, those will be removed in separate patches. ) Move the somewhat misplaced setup_xstate_comp() call to the core. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 4 --- arch/x86/kernel/fpu/core.c | 51 ++----------------------------------- arch/x86/kernel/fpu/init.c | 1 + arch/x86/kernel/process.c | 10 -------- arch/x86/kvm/x86.c | 11 -------- 5 files changed, 3 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 4ce830fb3f31..9454f21f0edf 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -558,10 +558,6 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) } } -extern void fpstate_cache_init(void); - -extern int fpstate_alloc(struct fpu *fpu); -extern void fpstate_free(struct fpu *fpu); extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); static inline unsigned long diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1697a9a34ff0..6b8d3e1b6ef8 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -225,34 +225,6 @@ void fpstate_init(struct fpu *fpu) } EXPORT_SYMBOL_GPL(fpstate_init); -/* - * FPU state allocation: - */ -static struct kmem_cache *task_xstate_cachep; - -void fpstate_cache_init(void) -{ - task_xstate_cachep = - kmem_cache_create("task_xstate", xstate_size, - __alignof__(union thread_xstate), - SLAB_PANIC | SLAB_NOTRACK, NULL); - setup_xstate_comp(); -} - -int fpstate_alloc(struct fpu *fpu) -{ - /* The CPU requires the FPU state to be aligned to 16 byte boundaries: */ - WARN_ON((unsigned long)&fpu->state & 15); - - return 0; -} -EXPORT_SYMBOL_GPL(fpstate_alloc); - -void fpstate_free(struct fpu *fpu) -{ -} -EXPORT_SYMBOL_GPL(fpstate_free); - /* * Copy the current task's FPU state to a new task's FPU context. * @@ -280,13 +252,9 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; - if (src_fpu->fpstate_active) { - int err = fpstate_alloc(dst_fpu); - - if (err) - return err; + if (src_fpu->fpstate_active) fpu_copy(dst_fpu, src_fpu); - } + return 0; } @@ -305,13 +273,6 @@ int fpstate_alloc_init(struct fpu *fpu) if (WARN_ON_ONCE(fpu->fpstate_active)) return -EINVAL; - /* - * Memory allocation at the first usage of the FPU and other state. - */ - ret = fpstate_alloc(fpu); - if (ret) - return ret; - fpstate_init(fpu); /* Safe to do for the current task: */ @@ -356,13 +317,6 @@ static int fpu__unlazy_stopped(struct fpu *child_fpu) return 0; } - /* - * Memory allocation at the first usage of the FPU and other state. - */ - ret = fpstate_alloc(child_fpu); - if (ret) - return ret; - fpstate_init(child_fpu); /* Safe to do for stopped child tasks: */ @@ -423,7 +377,6 @@ void fpu__clear(struct task_struct *tsk) if (!use_eager_fpu()) { /* FPU state will be reallocated lazily at the first use. */ drop_fpu(fpu); - fpstate_free(fpu); } else { if (!fpu->fpstate_active) { /* kthread execs. TODO: cleanup this horror. */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7ae5a62918c7..460e7e2c6186 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -265,6 +265,7 @@ void fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); + setup_xstate_comp(); fpu__init_system_ctx_switch(); } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 71f4b4d2f1fd..5d37c26fa89f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -86,16 +86,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return fpu__copy(&dst->thread.fpu, &src->thread.fpu); } -void arch_release_task_struct(struct task_struct *tsk) -{ - fpstate_free(&tsk->thread.fpu); -} - -void arch_task_cache_init(void) -{ - fpstate_cache_init(); -} - /* * Free current thread data structures etc.. */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8bb0de5bf9c0..68529251e897 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7008,10 +7008,6 @@ int fx_init(struct kvm_vcpu *vcpu) { int err; - err = fpstate_alloc(&vcpu->arch.guest_fpu); - if (err) - return err; - fpstate_init(&vcpu->arch.guest_fpu); if (cpu_has_xsaves) vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = @@ -7028,11 +7024,6 @@ int fx_init(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(fx_init); -static void fx_free(struct kvm_vcpu *vcpu) -{ - fpstate_free(&vcpu->arch.guest_fpu); -} - void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { if (vcpu->guest_fpu_loaded) @@ -7070,7 +7061,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvmclock_reset(vcpu); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -7126,7 +7116,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); - fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } -- cgit v1.2.1 From 91d93d0e206432b9fe4c88e64577b93aef018f98 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 06:46:52 +0200 Subject: x86/fpu: Remove failure return from fpstate_alloc_init() Remove the failure code and propagate this down to callers. Note that this function still has an 'init' aspect, which must be called. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/core.c | 37 +++++++------------------------------ arch/x86/kernel/fpu/xsave.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- arch/x86/math-emu/fpu_entry.c | 8 ++------ 5 files changed, 14 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 9454f21f0edf..1d0c5cee29eb 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -44,7 +44,7 @@ extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); -extern int fpstate_alloc_init(struct fpu *fpu); +extern void fpstate_alloc_init(struct fpu *fpu); extern void fpstate_init(struct fpu *fpu); extern void fpu__clear(struct task_struct *tsk); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 6b8d3e1b6ef8..b44ac5090641 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -259,26 +259,17 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) } /* - * Allocate the backing store for the current task's FPU registers - * and initialize the registers themselves as well. - * - * Can fail. + * Initialize the current task's in-memory FPU context: */ -int fpstate_alloc_init(struct fpu *fpu) +void fpstate_alloc_init(struct fpu *fpu) { - int ret; - - if (WARN_ON_ONCE(fpu != ¤t->thread.fpu)) - return -EINVAL; - if (WARN_ON_ONCE(fpu->fpstate_active)) - return -EINVAL; + WARN_ON_ONCE(fpu != ¤t->thread.fpu); + WARN_ON_ONCE(fpu->fpstate_active); fpstate_init(fpu); /* Safe to do for the current task: */ fpu->fpstate_active = 1; - - return 0; } EXPORT_SYMBOL_GPL(fpstate_alloc_init); @@ -340,20 +331,8 @@ void fpu__restore(void) struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - if (!fpu->fpstate_active) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (fpstate_alloc_init(fpu)) { - /* - * ran out of memory! - */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } + if (!fpu->fpstate_active) + fpstate_alloc_init(fpu); /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); @@ -379,9 +358,7 @@ void fpu__clear(struct task_struct *tsk) drop_fpu(fpu); } else { if (!fpu->fpstate_active) { - /* kthread execs. TODO: cleanup this horror. */ - if (WARN_ON(fpstate_alloc_init(fpu))) - force_sig(SIGKILL, tsk); + fpstate_alloc_init(fpu); user_fpu_begin(); } restore_init_xstate(); diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index c7d48eb0a194..dd2cef08a1a4 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -358,8 +358,8 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_READ, buf, size)) return -EACCES; - if (!fpu->fpstate_active && fpstate_alloc_init(fpu)) - return -1; + if (!fpu->fpstate_active) + fpstate_alloc_init(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 68529251e897..707f4e27ee91 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6601,8 +6601,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - if (!fpu->fpstate_active && fpstate_alloc_init(fpu)) - return -ENOMEM; + if (!fpu->fpstate_active) + fpstate_alloc_init(fpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 5e003704ebfa..99ddfc274df3 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -149,12 +149,8 @@ void math_emulate(struct math_emu_info *info) struct desc_struct code_descriptor; struct fpu *fpu = ¤t->thread.fpu; - if (!fpu->fpstate_active) { - if (fpstate_alloc_init(fpu)) { - do_group_exit(SIGKILL); - return; - } - } + if (!fpu->fpstate_active) + fpstate_alloc_init(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { -- cgit v1.2.1 From e62bb3d894d312a37009fb07dc83fd1cc7bed37c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 06:50:29 +0200 Subject: x86/fpu: Rename fpstate_alloc_init() to fpstate_init_curr() Now that there are no FPU context allocations, rename fpstate_alloc_init() to fpstate_init_curr(), to signal that it initializes the fpstate and marks it active, for the current task. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/core.c | 8 ++++---- arch/x86/kernel/fpu/xsave.c | 2 +- arch/x86/kvm/x86.c | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 1d0c5cee29eb..1345ab3dd273 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -44,7 +44,7 @@ extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); -extern void fpstate_alloc_init(struct fpu *fpu); +extern void fpstate_init_curr(struct fpu *fpu); extern void fpstate_init(struct fpu *fpu); extern void fpu__clear(struct task_struct *tsk); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b44ac5090641..45f014e2e204 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -261,7 +261,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Initialize the current task's in-memory FPU context: */ -void fpstate_alloc_init(struct fpu *fpu) +void fpstate_init_curr(struct fpu *fpu) { WARN_ON_ONCE(fpu != ¤t->thread.fpu); WARN_ON_ONCE(fpu->fpstate_active); @@ -271,7 +271,7 @@ void fpstate_alloc_init(struct fpu *fpu) /* Safe to do for the current task: */ fpu->fpstate_active = 1; } -EXPORT_SYMBOL_GPL(fpstate_alloc_init); +EXPORT_SYMBOL_GPL(fpstate_init_curr); /* * This function is called before we modify a stopped child's @@ -332,7 +332,7 @@ void fpu__restore(void) struct fpu *fpu = &tsk->thread.fpu; if (!fpu->fpstate_active) - fpstate_alloc_init(fpu); + fpstate_init_curr(fpu); /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); @@ -358,7 +358,7 @@ void fpu__clear(struct task_struct *tsk) drop_fpu(fpu); } else { if (!fpu->fpstate_active) { - fpstate_alloc_init(fpu); + fpstate_init_curr(fpu); user_fpu_begin(); } restore_init_xstate(); diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index dd2cef08a1a4..49d9f3dcc2ea 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -359,7 +359,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) return -EACCES; if (!fpu->fpstate_active) - fpstate_alloc_init(fpu); + fpstate_init_curr(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 707f4e27ee91..74b53c314da0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6602,7 +6602,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) sigset_t sigsaved; if (!fpu->fpstate_active) - fpstate_alloc_init(fpu); + fpstate_init_curr(fpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 99ddfc274df3..4c6ab791d0e5 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -150,7 +150,7 @@ void math_emulate(struct math_emu_info *info) struct fpu *fpu = ¤t->thread.fpu; if (!fpu->fpstate_active) - fpstate_alloc_init(fpu); + fpstate_init_curr(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { -- cgit v1.2.1 From 2fb29fc7c690d3d318471c42a62a05153d433cc1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 06:55:54 +0200 Subject: x86/fpu: Simplify fpu__unlazy_stopped() error handling Now that FPU contexts are always allocated, fpu__unlazy_stopped() cannot fail. Remove its error return and propagate the changes to the callers. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 48 +++++++++++++--------------------------------- 1 file changed, 13 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 45f014e2e204..97b4f9e76f60 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -296,24 +296,18 @@ EXPORT_SYMBOL_GPL(fpstate_init_curr); * the read-only case, it's not strictly necessary for * read-only access to the context. */ -static int fpu__unlazy_stopped(struct fpu *child_fpu) +static void fpu__unlazy_stopped(struct fpu *child_fpu) { - int ret; - - if (WARN_ON_ONCE(child_fpu == ¤t->thread.fpu)) - return -EINVAL; + WARN_ON_ONCE(child_fpu == ¤t->thread.fpu); if (child_fpu->fpstate_active) { child_fpu->last_cpu = -1; - return 0; - } - - fpstate_init(child_fpu); - - /* Safe to do for stopped child tasks: */ - child_fpu->fpstate_active = 1; + } else { + fpstate_init(child_fpu); - return 0; + /* Safe to do for stopped child tasks: */ + child_fpu->fpstate_active = 1; + } } /* @@ -389,15 +383,11 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, void *kbuf, void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; - int ret; if (!cpu_has_fxsr) return -ENODEV; - ret = fpu__unlazy_stopped(fpu); - if (ret) - return ret; - + fpu__unlazy_stopped(fpu); sanitize_i387_state(target); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -414,10 +404,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - ret = fpu__unlazy_stopped(fpu); - if (ret) - return ret; - + fpu__unlazy_stopped(fpu); sanitize_i387_state(target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -449,9 +436,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_xsave) return -ENODEV; - ret = fpu__unlazy_stopped(fpu); - if (ret) - return ret; + fpu__unlazy_stopped(fpu); xsave = &fpu->state.xsave; @@ -480,9 +465,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_xsave) return -ENODEV; - ret = fpu__unlazy_stopped(fpu); - if (ret) - return ret; + fpu__unlazy_stopped(fpu); xsave = &fpu->state.xsave; @@ -643,11 +626,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, { struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; - int ret; - ret = fpu__unlazy_stopped(fpu); - if (ret) - return ret; + fpu__unlazy_stopped(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); @@ -677,9 +657,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - ret = fpu__unlazy_stopped(fpu); - if (ret) - return ret; + fpu__unlazy_stopped(fpu); sanitize_i387_state(target); -- cgit v1.2.1 From 0ee6a5172573aea06ef41f4e48737dcfab0099bb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 06:58:22 +0200 Subject: x86/fpu, kvm: Simplify fx_init() Now that fpstate_init() cannot fail the error return of fx_init() has lost its purpose. Eliminate the error return and propagate this change to all callers. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/x86.c | 14 +++----------- 2 files changed, 3 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dea2e7e962e3..c29e61a8d6d4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -999,8 +999,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -int fx_init(struct kvm_vcpu *vcpu); - void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 74b53c314da0..92a8490cc69d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7004,10 +7004,8 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } -int fx_init(struct kvm_vcpu *vcpu) +static void fx_init(struct kvm_vcpu *vcpu) { - int err; - fpstate_init(&vcpu->arch.guest_fpu); if (cpu_has_xsaves) vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = @@ -7019,10 +7017,7 @@ int fx_init(struct kvm_vcpu *vcpu) vcpu->arch.xcr0 = XSTATE_FP; vcpu->arch.cr0 |= X86_CR0_ET; - - return 0; } -EXPORT_SYMBOL_GPL(fx_init); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { @@ -7341,9 +7336,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; } - r = fx_init(vcpu); - if (r) - goto fail_free_wbinvd_dirty_mask; + fx_init(vcpu); vcpu->arch.ia32_tsc_adjust_msr = 0x0; vcpu->arch.pv_time_enabled = false; @@ -7357,8 +7350,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_pmu_init(vcpu); return 0; -fail_free_wbinvd_dirty_mask: - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + fail_free_mce_banks: kfree(vcpu->arch.mce_banks); fail_free_lapic: -- cgit v1.2.1 From c4d72e2db3a36bf560b506df8a3490f140aeae26 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 07:18:17 +0200 Subject: x86/fpu: Simplify fpstate_init_curr() usage Now that fpstate_init_curr() is not doing implicit allocations anymore, almost all uses of it involve a very simple pattern: if (!fpu->fpstate_active) fpstate_init_curr(fpu); which is basically activating the FPU fpstate if it was not active before. So propagate the check into the function itself, and rename the function according to its new purpose: fpu__activate_curr(fpu); Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/core.c | 21 +++++++++++---------- arch/x86/kernel/fpu/xsave.c | 3 +-- arch/x86/kvm/x86.c | 3 +-- arch/x86/math-emu/fpu_entry.c | 3 +-- 5 files changed, 15 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 1345ab3dd273..de19fc53f54e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -44,7 +44,7 @@ extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); -extern void fpstate_init_curr(struct fpu *fpu); +extern void fpu__activate_curr(struct fpu *fpu); extern void fpstate_init(struct fpu *fpu); extern void fpu__clear(struct task_struct *tsk); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 97b4f9e76f60..74cc32265918 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -259,19 +259,21 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) } /* - * Initialize the current task's in-memory FPU context: + * Activate the current task's in-memory FPU context, + * if it has not been used before: */ -void fpstate_init_curr(struct fpu *fpu) +void fpu__activate_curr(struct fpu *fpu) { WARN_ON_ONCE(fpu != ¤t->thread.fpu); - WARN_ON_ONCE(fpu->fpstate_active); - fpstate_init(fpu); + if (!fpu->fpstate_active) { + fpstate_init(fpu); - /* Safe to do for the current task: */ - fpu->fpstate_active = 1; + /* Safe to do for the current task: */ + fpu->fpstate_active = 1; + } } -EXPORT_SYMBOL_GPL(fpstate_init_curr); +EXPORT_SYMBOL_GPL(fpu__activate_curr); /* * This function is called before we modify a stopped child's @@ -325,8 +327,7 @@ void fpu__restore(void) struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - if (!fpu->fpstate_active) - fpstate_init_curr(fpu); + fpu__activate_curr(fpu); /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); @@ -352,7 +353,7 @@ void fpu__clear(struct task_struct *tsk) drop_fpu(fpu); } else { if (!fpu->fpstate_active) { - fpstate_init_curr(fpu); + fpu__activate_curr(fpu); user_fpu_begin(); } restore_init_xstate(); diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c index 49d9f3dcc2ea..f549e2a44336 100644 --- a/arch/x86/kernel/fpu/xsave.c +++ b/arch/x86/kernel/fpu/xsave.c @@ -358,8 +358,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_READ, buf, size)) return -EACCES; - if (!fpu->fpstate_active) - fpstate_init_curr(fpu); + fpu__activate_curr(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92a8490cc69d..9ff4df77e069 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6601,8 +6601,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - if (!fpu->fpstate_active) - fpstate_init_curr(fpu); + fpu__activate_curr(fpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 4c6ab791d0e5..5b850514eb68 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -149,8 +149,7 @@ void math_emulate(struct math_emu_info *info) struct desc_struct code_descriptor; struct fpu *fpu = ¤t->thread.fpu; - if (!fpu->fpstate_active) - fpstate_init_curr(fpu); + fpu__activate_curr(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { -- cgit v1.2.1 From 67ee658e6fb8611a64dd8406b0081b1fba9dec1b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 07:22:58 +0200 Subject: x86/fpu: Rename fpu__unlazy_stopped() to fpu__activate_stopped() In line with the fpstate_activate() change, name fpu__unlazy_stopped() in a similar fashion as well: its purpose is to make the fpstate of a stopped task the current and active FPU context, which may require unlazying and initialization. The unlazying is just part of the job, the main concept is to make the fpstate active. Also clarify the function's description to clarify its exact usage and the background behind it all. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 74cc32265918..a407bf5cb92f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -276,29 +276,30 @@ void fpu__activate_curr(struct fpu *fpu) EXPORT_SYMBOL_GPL(fpu__activate_curr); /* - * This function is called before we modify a stopped child's - * FPU state context. + * This function must be called before we modify a stopped child's + * fpstate. * * If the child has not used the FPU before then initialize its - * FPU context. + * fpstate. * * If the child has used the FPU before then unlazy it. * - * [ After this function call, after the context is modified and - * the child task is woken up, the child task will restore - * the modified FPU state from the modified context. If we + * [ After this function call, after registers in the fpstate are + * modified and the child task has woken up, the child task will + * restore the modified FPU state from the modified context. If we * didn't clear its lazy status here then the lazy in-registers - * state pending on its former CPU could be restored, losing + * state pending on its former CPU could be restored, corrupting * the modifications. ] * * This function is also called before we read a stopped child's - * FPU state - to make sure it's modified. + * FPU state - to make sure it's initialized if the child has + * no active FPU state. * * TODO: A future optimization would be to skip the unlazying in * the read-only case, it's not strictly necessary for * read-only access to the context. */ -static void fpu__unlazy_stopped(struct fpu *child_fpu) +static void fpu__activate_stopped(struct fpu *child_fpu) { WARN_ON_ONCE(child_fpu == ¤t->thread.fpu); @@ -388,7 +389,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - fpu__unlazy_stopped(fpu); + fpu__activate_stopped(fpu); sanitize_i387_state(target); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -405,7 +406,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - fpu__unlazy_stopped(fpu); + fpu__activate_stopped(fpu); sanitize_i387_state(target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -437,7 +438,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_xsave) return -ENODEV; - fpu__unlazy_stopped(fpu); + fpu__activate_stopped(fpu); xsave = &fpu->state.xsave; @@ -466,7 +467,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_xsave) return -ENODEV; - fpu__unlazy_stopped(fpu); + fpu__activate_stopped(fpu); xsave = &fpu->state.xsave; @@ -628,7 +629,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; - fpu__unlazy_stopped(fpu); + fpu__activate_stopped(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); @@ -658,7 +659,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__unlazy_stopped(fpu); + fpu__activate_stopped(fpu); sanitize_i387_state(target); -- cgit v1.2.1 From 32b49b3c83cad1ba60494a00dad2f511a647fb5a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 08:58:45 +0200 Subject: x86/fpu: Factor out FPU hw activation/deactivation We have repeat patterns of: if (!use_eager_fpu()) clts(); ... to activate FPU registers, and: if (!use_eager_fpu()) stts(); ... to deactivate them. Encapsulate these in: __fpregs_activate_hw(); __fpregs_activate_hw(); and use them accordingly. Doing this synchronizes the idiom with the fpu->fpregs_active software-flag's handling functions, creating clear patterns of: __fpregs_activate_hw(); __fpregs_activate(fpu); etc., which improves readability. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 32 ++++++++++++++++++++++++-------- arch/x86/kernel/fpu/core.c | 7 +++---- 2 files changed, 27 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index de19fc53f54e..28556c6671c3 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -324,14 +324,31 @@ static inline int restore_fpu_checking(struct fpu *fpu) return fpu_restore_checking(fpu); } -/* Must be paired with an 'stts' after! */ +/* + * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' + * idiom, which is then paired with the sw-flag (fpregs_active) later on: + */ + +static inline void __fpregs_activate_hw(void) +{ + if (!use_eager_fpu()) + clts(); +} + +static inline void __fpregs_deactivate_hw(void) +{ + if (!use_eager_fpu()) + stts(); +} + +/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ static inline void __fpregs_deactivate(struct fpu *fpu) { fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); } -/* Must be paired with a 'clts' before! */ +/* Must be paired with a 'clts' (fpregs_activate_hw()) before! */ static inline void __fpregs_activate(struct fpu *fpu) { fpu->fpregs_active = 1; @@ -362,16 +379,14 @@ static inline int user_has_fpu(void) */ static inline void fpregs_activate(struct fpu *fpu) { - if (!use_eager_fpu()) - clts(); + __fpregs_activate_hw(); __fpregs_activate(fpu); } static inline void fpregs_deactivate(struct fpu *fpu) { __fpregs_deactivate(fpu); - if (!use_eager_fpu()) - stts(); + __fpregs_deactivate_hw(); } static inline void drop_fpu(struct fpu *fpu) @@ -455,8 +470,9 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) new_fpu->counter++; __fpregs_activate(new_fpu); prefetch(&new_fpu->state); - } else if (!use_eager_fpu()) - stts(); + } else { + __fpregs_deactivate_hw(); + } } else { old_fpu->counter = 0; old_fpu->last_cpu = -1; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a407bf5cb92f..a617aac1cf81 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -105,8 +105,7 @@ void __kernel_fpu_begin(void) copy_fpregs_to_fpstate(fpu); } else { this_cpu_write(fpu_fpregs_owner_ctx, NULL); - if (!use_eager_fpu()) - clts(); + __fpregs_activate_hw(); } } EXPORT_SYMBOL(__kernel_fpu_begin); @@ -118,8 +117,8 @@ void __kernel_fpu_end(void) if (fpu->fpregs_active) { if (WARN_ON(restore_fpu_checking(fpu))) fpu_reset_state(fpu); - } else if (!use_eager_fpu()) { - stts(); + } else { + __fpregs_deactivate_hw(); } kernel_fpu_enable(); -- cgit v1.2.1 From 72ee6f87adcb7b7bdb71cc81c22858811ee1a069 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 09:23:43 +0200 Subject: x86/fpu: Simplify __save_fpu() __save_fpu() has this pattern: if (unlikely(system_state == SYSTEM_BOOTING)) xsave_state_booting(&fpu->state.xsave); else xsave_state(&fpu->state.xsave); ... but it does not actually get called during system bootup. So remove the complication and always call xsave_state(). To make sure this assumption is correct, add a WARN_ONCE() debug check to xsave_state(). Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xsave.h | 2 ++ arch/x86/kernel/fpu/core.c | 5 +---- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xsave.h b/arch/x86/include/asm/fpu/xsave.h index a10e66582c1b..2f2ed322263f 100644 --- a/arch/x86/include/asm/fpu/xsave.h +++ b/arch/x86/include/asm/fpu/xsave.h @@ -133,6 +133,8 @@ static inline int xsave_state(struct xsave_struct *fx) u32 hmask = mask >> 32; int err = 0; + WARN_ON(system_state == SYSTEM_BOOTING); + /* * If xsaves is enabled, xsaves replaces xsaveopt because * it supports compact format and supervisor states in addition to diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a617aac1cf81..79a0b99d53b6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -172,10 +172,7 @@ EXPORT_SYMBOL_GPL(irq_ts_restore); static void __save_fpu(struct fpu *fpu) { if (use_xsave()) { - if (unlikely(system_state == SYSTEM_BOOTING)) - xsave_state_booting(&fpu->state.xsave); - else - xsave_state(&fpu->state.xsave); + xsave_state(&fpu->state.xsave); } else { fpu_fxsave(fpu); } -- cgit v1.2.1 From 9f876d67663c8412a386321d94b17b68105b13ac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 09:26:41 +0200 Subject: x86/fpu: Eliminate __save_fpu() The current implementation of __save_fpu(): if (use_xsave()) { xsave_state(&fpu->state.xsave); } else { fpu_fxsave(fpu); } Is actually a simplified version of copy_fpregs_to_fpstate(), if use_eager_fpu() is true. But all call sites of __save_fpu() call it only it when use_eager_fpu() is true. So we can eliminate __save_fpu() altogether and use the standard copy_fpregs_to_fpstate() function. This cleans up the code by making it use fewer variants of FPU register saving. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 79a0b99d53b6..00408149de40 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -169,15 +169,6 @@ void irq_ts_restore(int TS_state) } EXPORT_SYMBOL_GPL(irq_ts_restore); -static void __save_fpu(struct fpu *fpu) -{ - if (use_xsave()) { - xsave_state(&fpu->state.xsave); - } else { - fpu_fxsave(fpu); - } -} - /* * Save the FPU state (initialize it if necessary): * @@ -190,7 +181,7 @@ void fpu__save(struct fpu *fpu) preempt_disable(); if (fpu->fpregs_active) { if (use_eager_fpu()) { - __save_fpu(fpu); + copy_fpregs_to_fpstate(fpu); } else { copy_fpregs_to_fpstate(fpu); fpregs_deactivate(fpu); @@ -235,7 +226,7 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) if (use_eager_fpu()) { memset(&dst_fpu->state.xsave, 0, xstate_size); - __save_fpu(dst_fpu); + copy_fpregs_to_fpstate(dst_fpu); } else { fpu__save(src_fpu); memcpy(&dst_fpu->state, &src_fpu->state, xstate_size); -- cgit v1.2.1 From fea435a2027a88407181c387f62daabf30bb5ea7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 09:38:21 +0200 Subject: x86/fpu: Simplify fpu__save() Factor out a common call. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 00408149de40..a0b2221b686d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -180,12 +180,9 @@ void fpu__save(struct fpu *fpu) preempt_disable(); if (fpu->fpregs_active) { - if (use_eager_fpu()) { - copy_fpregs_to_fpstate(fpu); - } else { - copy_fpregs_to_fpstate(fpu); + copy_fpregs_to_fpstate(fpu); + if (!use_eager_fpu()) fpregs_deactivate(fpu); - } } preempt_enable(); } -- cgit v1.2.1 From 48c4717f30cc1f83d02f045c3b47a2885863bff2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 09:45:12 +0200 Subject: x86/fpu: Optimize fpu__save() So fpu__save() does this currently: copy_fpregs_to_fpstate(fpu); if (!use_eager_fpu()) fpregs_deactivate(fpu); ... which deactivates the FPU on lazy switching systems unconditionally. Both usecases of fpu__save() use this function to save the FPU state into a fpstate: fork()/clone() and math error signal handling. The unconditional disabling of FPU registers in the lazy switching case is probably a mistaken conversion of old FNSAVE code (that had to disable FPU registers). So speed up this code by only disabling FPU registers when absolutely necessary: when indicated by the copy_fpregs_to_fpstate() return code: if (!copy_fpregs_to_fpstate(fpu)) fpregs_deactivate(fpu); Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a0b2221b686d..b1fbbb8eb934 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -170,7 +170,7 @@ void irq_ts_restore(int TS_state) EXPORT_SYMBOL_GPL(irq_ts_restore); /* - * Save the FPU state (initialize it if necessary): + * Save the FPU state (mark it for reload if necessary): * * This only ever gets called for the current task. */ @@ -180,8 +180,7 @@ void fpu__save(struct fpu *fpu) preempt_disable(); if (fpu->fpregs_active) { - copy_fpregs_to_fpstate(fpu); - if (!use_eager_fpu()) + if (!copy_fpregs_to_fpstate(fpu)) fpregs_deactivate(fpu); } preempt_enable(); -- cgit v1.2.1 From 68271c6ae726d7ab51e39b7342c838761bf0a25c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 09:59:11 +0200 Subject: x86/fpu: Optimize fpu_copy() Optimize fpu_copy() a bit by expanding the ->fpstate_active == 1 portion of fpu__save() into it. ( The main purpose of this change is to enable another, larger optimization that will be done in the next patch. ) Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b1fbbb8eb934..41ea25a61b5f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -224,7 +224,10 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) memset(&dst_fpu->state.xsave, 0, xstate_size); copy_fpregs_to_fpstate(dst_fpu); } else { - fpu__save(src_fpu); + preempt_disable(); + if (!copy_fpregs_to_fpstate(src_fpu)) + fpregs_deactivate(src_fpu); + preempt_enable(); memcpy(&dst_fpu->state, &src_fpu->state, xstate_size); } } -- cgit v1.2.1 From b16529004f5cc0debf8073d21b560a4677a03a2a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2015 10:08:39 +0200 Subject: x86/fpu: Optimize fpu_copy() some more on lazy switching systems The current fpu_copy() code on lazy switching CPUs always saves into the current fpstate and then copies it over into the child context: preempt_disable(); if (!copy_fpregs_to_fpstate(src_fpu)) fpregs_deactivate(src_fpu); preempt_enable(); memcpy(&dst_fpu->state, &src_fpu->state, xstate_size); That memcpy() can be avoided on all lazy switching setups except really old FNSAVE-only systems: change fpu_copy() to directly save into the child context, for both the lazy and the eager context switching case. Note that we still have to do a memcpy() back into the parent context in the FNSAVE case, but this won't be executed on the majority of x86 systems that got built in the last 10 years or so. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 41ea25a61b5f..edbb5d04a558 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -220,16 +220,35 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) { WARN_ON(src_fpu != ¤t->thread.fpu); - if (use_eager_fpu()) { + /* + * Don't let 'init optimized' areas of the XSAVE area + * leak into the child task: + */ + if (use_eager_fpu()) memset(&dst_fpu->state.xsave, 0, xstate_size); - copy_fpregs_to_fpstate(dst_fpu); - } else { - preempt_disable(); - if (!copy_fpregs_to_fpstate(src_fpu)) - fpregs_deactivate(src_fpu); - preempt_enable(); - memcpy(&dst_fpu->state, &src_fpu->state, xstate_size); + + /* + * Save current FPU registers directly into the child + * FPU context, without any memory-to-memory copying. + * + * If the FPU context got destroyed in the process (FNSAVE + * done on old CPUs) then copy it back into the source + * context and mark the current task for lazy restore. + * + * We have to do all this with preemption disabled, + * mostly because of the FNSAVE case, because in that + * case we must not allow preemption in the window + * between the FNSAVE and us marking the context lazy. + * + * It shouldn't be an issue as even FNSAVE is plenty + * fast in terms of critical section length. + */ + preempt_disable(); + if (!copy_fpregs_to_fpstate(dst_fpu)) { + memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); + fpregs_deactivate(src_fpu); } + preempt_enable(); } int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) -- cgit v1.2.1 From 669ebabb79906302ba6e6922a683893788a134e8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 08:41:33 +0200 Subject: x86/fpu: Rename fpu/xsave.h to fpu/xstate.h 'xsave' is an x86 instruction name to most people - but xsave.h is about a lot more than just the XSAVE instruction: it includes definitions and support, both internal and external, related to xstate and xfeatures support. As a first step in cleaning up the various xstate uses rename this header to 'fpu/xstate.h' to better reflect what this header file is about. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/camellia_aesni_avx2_glue.c | 2 +- arch/x86/crypto/camellia_aesni_avx_glue.c | 2 +- arch/x86/crypto/cast5_avx_glue.c | 2 +- arch/x86/crypto/cast6_avx_glue.c | 2 +- arch/x86/crypto/serpent_avx2_glue.c | 2 +- arch/x86/crypto/serpent_avx_glue.c | 2 +- arch/x86/crypto/sha-mb/sha1_mb.c | 2 +- arch/x86/crypto/sha1_ssse3_glue.c | 2 +- arch/x86/crypto/sha256_ssse3_glue.c | 2 +- arch/x86/crypto/sha512_ssse3_glue.c | 2 +- arch/x86/crypto/twofish_avx_glue.c | 2 +- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/include/asm/fpu/xsave.h | 254 ----------------------------- arch/x86/include/asm/fpu/xstate.h | 254 +++++++++++++++++++++++++++++ arch/x86/kvm/cpuid.c | 2 +- 15 files changed, 267 insertions(+), 267 deletions(-) delete mode 100644 arch/x86/include/asm/fpu/xsave.h create mode 100644 arch/x86/include/asm/fpu/xstate.h (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 004acd7bb4e0..46484482f267 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 2f7ead8caf53..0122cd95563a 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 2c3360be6fc8..ca4f32e7a423 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #define CAST5_PARALLEL_BLOCKS 16 diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index a2ec18a56e4f..21d0b845c8c4 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #define CAST6_PARALLEL_BLOCKS 8 diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 206ec57725a3..aa325fa5c7a6 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 4feb68c9a41f..f66ae85f58fe 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index 03ffaf8c2244..6f3f76568bd5 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -66,7 +66,7 @@ #include #include #include -#include +#include #include #include #include "sha_mb_ctx.h" diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 71ab2b35d5e0..84db12f052e8 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index dcbd8ea6eaaf..eb65522f02b8 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index e8836e0c1098..78914641c72b 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 3b6c8ba64f81..95434f5b705a 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 28556c6671c3..8ec785ecce81 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -17,7 +17,7 @@ #include #include -#include +#include #ifdef CONFIG_X86_64 # include diff --git a/arch/x86/include/asm/fpu/xsave.h b/arch/x86/include/asm/fpu/xsave.h deleted file mode 100644 index 2f2ed322263f..000000000000 --- a/arch/x86/include/asm/fpu/xsave.h +++ /dev/null @@ -1,254 +0,0 @@ -#ifndef __ASM_X86_XSAVE_H -#define __ASM_X86_XSAVE_H - -#include -#include - -#define XSTATE_CPUID 0x0000000d - -#define XSTATE_FP 0x1 -#define XSTATE_SSE 0x2 -#define XSTATE_YMM 0x4 -#define XSTATE_BNDREGS 0x8 -#define XSTATE_BNDCSR 0x10 -#define XSTATE_OPMASK 0x20 -#define XSTATE_ZMM_Hi256 0x40 -#define XSTATE_Hi16_ZMM 0x80 - -/* The highest xstate bit above (of XSTATE_Hi16_ZMM): */ -#define XFEATURES_NR_MAX 8 - -#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) -#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) -/* Bit 63 of XCR0 is reserved for future expansion */ -#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) - -#define FXSAVE_SIZE 512 - -#define XSAVE_HDR_SIZE 64 -#define XSAVE_HDR_OFFSET FXSAVE_SIZE - -#define XSAVE_YMM_SIZE 256 -#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) - -/* Supported features which support lazy state saving */ -#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) - -/* Supported features which require eager state saving */ -#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) - -/* All currently supported features */ -#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER) - -#ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " -#else -#define REX_PREFIX -#endif - -extern unsigned int xstate_size; -extern u64 xfeatures_mask; -extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern struct xsave_struct init_xstate_ctx; - -extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); - -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" - -#define xstate_fault ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err) - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int xsave_state_booting(struct xsave_struct *fx) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XSAVES"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XSAVE"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XRSTORS"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XRSTOR"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * Save processor xstate to xsave area. - */ -static inline int xsave_state(struct xsave_struct *fx) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state == SYSTEM_BOOTING); - - /* - * If xsaves is enabled, xsaves replaces xsaveopt because - * it supports compact format and supervisor states in addition to - * modified optimization in xsaveopt. - * - * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave - * because xsaveopt supports modified optimization which is not - * supported by xsave. - * - * If none of xsaves and xsaveopt is enabled, use xsave. - */ - alternative_input_2( - "1:"XSAVE, - XSAVEOPT, - X86_FEATURE_XSAVEOPT, - XSAVES, - X86_FEATURE_XSAVES, - [fx] "D" (fx), "a" (lmask), "d" (hmask) : - "memory"); - asm volatile("2:\n\t" - xstate_fault - : "0" (0) - : "memory"); - - return err; -} - -/* - * Restore processor xstate from xsave area. - */ -static inline int xrstor_state(struct xsave_struct *fx, u64 mask) -{ - int err = 0; - u32 lmask = mask; - u32 hmask = mask >> 32; - - /* - * Use xrstors to restore context if it is enabled. xrstors supports - * compacted format of xsave area which is not supported by xrstor. - */ - alternative_input( - "1: " XRSTOR, - XRSTORS, - X86_FEATURE_XSAVES, - "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - - asm volatile("2:\n" - xstate_fault - : "0" (0) - : "memory"); - - return err; -} - -/* - * Restore xstate context for new process during context switch. - */ -static inline int fpu_xrstor_checking(struct xsave_struct *fx) -{ - return xrstor_state(fx, -1); -} - -/* - * Save xstate to user space xsave area. - * - * We don't use modified optimization because xrstor/xrstors might track - * a different application. - * - * We don't use compacted format xsave area for - * backward compatibility for old applications which don't understand - * compacted format of xsave area. - */ -static inline int xsave_user(struct xsave_struct __user *buf) -{ - int err; - - /* - * Clear the xsave header first, so that reserved fields are - * initialized to zero. - */ - err = __clear_user(&buf->header, sizeof(buf->header)); - if (unlikely(err)) - return -EFAULT; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XSAVE"\n" - "2: " ASM_CLAC "\n" - xstate_fault - : "D" (buf), "a" (-1), "d" (-1), "0" (0) - : "memory"); - return err; -} - -/* - * Restore xstate from user space xsave area. - */ -static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) -{ - int err = 0; - struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); - u32 lmask = mask; - u32 hmask = mask >> 32; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XRSTOR"\n" - "2: " ASM_CLAC "\n" - xstate_fault - : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) - : "memory"); /* memory required? */ - return err; -} - -void *get_xsave_addr(struct xsave_struct *xsave, int xstate); -void setup_xstate_comp(void); - -#endif diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h new file mode 100644 index 000000000000..2f2ed322263f --- /dev/null +++ b/arch/x86/include/asm/fpu/xstate.h @@ -0,0 +1,254 @@ +#ifndef __ASM_X86_XSAVE_H +#define __ASM_X86_XSAVE_H + +#include +#include + +#define XSTATE_CPUID 0x0000000d + +#define XSTATE_FP 0x1 +#define XSTATE_SSE 0x2 +#define XSTATE_YMM 0x4 +#define XSTATE_BNDREGS 0x8 +#define XSTATE_BNDCSR 0x10 +#define XSTATE_OPMASK 0x20 +#define XSTATE_ZMM_Hi256 0x40 +#define XSTATE_Hi16_ZMM 0x80 + +/* The highest xstate bit above (of XSTATE_Hi16_ZMM): */ +#define XFEATURES_NR_MAX 8 + +#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) +/* Bit 63 of XCR0 is reserved for future expansion */ +#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) + +#define FXSAVE_SIZE 512 + +#define XSAVE_HDR_SIZE 64 +#define XSAVE_HDR_OFFSET FXSAVE_SIZE + +#define XSAVE_YMM_SIZE 256 +#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) + +/* Supported features which support lazy state saving */ +#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) + +/* Supported features which require eager state saving */ +#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) + +/* All currently supported features */ +#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER) + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +extern unsigned int xstate_size; +extern u64 xfeatures_mask; +extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; +extern struct xsave_struct init_xstate_ctx; + +extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +#define xstate_fault ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err) + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline int xsave_state_booting(struct xsave_struct *fx) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XSAVES"\n\t" + "2:\n\t" + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + else + asm volatile("1:"XSAVE"\n\t" + "2:\n\t" + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + return err; +} + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XRSTORS"\n\t" + "2:\n\t" + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + else + asm volatile("1:"XRSTOR"\n\t" + "2:\n\t" + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + return err; +} + +/* + * Save processor xstate to xsave area. + */ +static inline int xsave_state(struct xsave_struct *fx) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state == SYSTEM_BOOTING); + + /* + * If xsaves is enabled, xsaves replaces xsaveopt because + * it supports compact format and supervisor states in addition to + * modified optimization in xsaveopt. + * + * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave + * because xsaveopt supports modified optimization which is not + * supported by xsave. + * + * If none of xsaves and xsaveopt is enabled, use xsave. + */ + alternative_input_2( + "1:"XSAVE, + XSAVEOPT, + X86_FEATURE_XSAVEOPT, + XSAVES, + X86_FEATURE_XSAVES, + [fx] "D" (fx), "a" (lmask), "d" (hmask) : + "memory"); + asm volatile("2:\n\t" + xstate_fault + : "0" (0) + : "memory"); + + return err; +} + +/* + * Restore processor xstate from xsave area. + */ +static inline int xrstor_state(struct xsave_struct *fx, u64 mask) +{ + int err = 0; + u32 lmask = mask; + u32 hmask = mask >> 32; + + /* + * Use xrstors to restore context if it is enabled. xrstors supports + * compacted format of xsave area which is not supported by xrstor. + */ + alternative_input( + "1: " XRSTOR, + XRSTORS, + X86_FEATURE_XSAVES, + "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile("2:\n" + xstate_fault + : "0" (0) + : "memory"); + + return err; +} + +/* + * Restore xstate context for new process during context switch. + */ +static inline int fpu_xrstor_checking(struct xsave_struct *fx) +{ + return xrstor_state(fx, -1); +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for + * backward compatibility for old applications which don't understand + * compacted format of xsave area. + */ +static inline int xsave_user(struct xsave_struct __user *buf) +{ + int err; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + err = __clear_user(&buf->header, sizeof(buf->header)); + if (unlikely(err)) + return -EFAULT; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XSAVE"\n" + "2: " ASM_CLAC "\n" + xstate_fault + : "D" (buf), "a" (-1), "d" (-1), "0" (0) + : "memory"); + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) +{ + int err = 0; + struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XRSTOR"\n" + "2: " ASM_CLAC "\n" + xstate_fault + : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) + : "memory"); /* memory required? */ + return err; +} + +void *get_xsave_addr(struct xsave_struct *xsave, int xstate); +void setup_xstate_comp(void); + +#endif diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0ce4c4f87332..2426e6530d3c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "cpuid.h" #include "lapic.h" #include "mmu.h" -- cgit v1.2.1 From 62784854502895321d100b1004b486131b9dd286 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 08:46:23 +0200 Subject: x86/fpu: Rename fpu/xsave.c to fpu/xstate.c Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/Makefile | 2 +- arch/x86/kernel/fpu/xsave.c | 714 ------------------------------------------- arch/x86/kernel/fpu/xstate.c | 714 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 715 insertions(+), 715 deletions(-) delete mode 100644 arch/x86/kernel/fpu/xsave.c create mode 100644 arch/x86/kernel/fpu/xstate.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile index 2020a2b7a597..6ae59bccdd2f 100644 --- a/arch/x86/kernel/fpu/Makefile +++ b/arch/x86/kernel/fpu/Makefile @@ -2,4 +2,4 @@ # Build rules for the FPU support code: # -obj-y += init.o bugs.o core.o xsave.o +obj-y += init.o bugs.o core.o xstate.o diff --git a/arch/x86/kernel/fpu/xsave.c b/arch/x86/kernel/fpu/xsave.c deleted file mode 100644 index f549e2a44336..000000000000 --- a/arch/x86/kernel/fpu/xsave.c +++ /dev/null @@ -1,714 +0,0 @@ -/* - * xsave/xrstor support. - * - * Author: Suresh Siddha - */ -#include -#include -#include -#include -#include -#include -#include - -/* - * Mask of xstate features supported by the CPU and the kernel: - */ -u64 xfeatures_mask; - -/* - * Represents init state for the supported extended state. - */ -struct xsave_struct init_xstate_ctx; - -static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; -static unsigned int xstate_offsets[XFEATURES_NR_MAX], xstate_sizes[XFEATURES_NR_MAX]; -static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; - -/* The number of supported xfeatures in xfeatures_mask: */ -static unsigned int xfeatures_nr; - -/* - * When executing XSAVEOPT (optimized XSAVE), if a processor implementation - * detects that an FPU state component is still (or is again) in its - * initialized state, it may clear the corresponding bit in the header.xfeatures - * field, and can skip the writeout of registers to the corresponding memory layout. - * - * This means that when the bit is zero, the state component might still contain - * some previous - non-initialized register state. - * - * Before writing xstate information to user-space we sanitize those components, - * to always ensure that the memory layout of a feature will be in the init state - * if the corresponding header bit is zero. This is to ensure that user-space doesn't - * see some stale state in the memory layout during signal handling, debugging etc. - */ -void __sanitize_i387_state(struct task_struct *tsk) -{ - struct i387_fxsave_struct *fx = &tsk->thread.fpu.state.fxsave; - int feature_bit; - u64 xfeatures; - - if (!fx) - return; - - xfeatures = tsk->thread.fpu.state.xsave.header.xfeatures; - - /* - * None of the feature bits are in init state. So nothing else - * to do for us, as the memory layout is up to date. - */ - if ((xfeatures & xfeatures_mask) == xfeatures_mask) - return; - - /* - * FP is in init state - */ - if (!(xfeatures & XSTATE_FP)) { - fx->cwd = 0x37f; - fx->swd = 0; - fx->twd = 0; - fx->fop = 0; - fx->rip = 0; - fx->rdp = 0; - memset(&fx->st_space[0], 0, 128); - } - - /* - * SSE is in init state - */ - if (!(xfeatures & XSTATE_SSE)) - memset(&fx->xmm_space[0], 0, 256); - - /* - * First two features are FPU and SSE, which above we handled - * in a special way already: - */ - feature_bit = 0x2; - xfeatures = (xfeatures_mask & ~xfeatures) >> 2; - - /* - * Update all the remaining memory layouts according to their - * standard xstate layout, if their header bit is in the init - * state: - */ - while (xfeatures) { - if (xfeatures & 0x1) { - int offset = xstate_offsets[feature_bit]; - int size = xstate_sizes[feature_bit]; - - memcpy((void *)fx + offset, - (void *)&init_xstate_ctx + offset, - size); - } - - xfeatures >>= 1; - feature_bit++; - } -} - -/* - * Check for the presence of extended state information in the - * user fpstate pointer in the sigcontext. - */ -static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *fx_sw) -{ - int min_xstate_size = sizeof(struct i387_fxsave_struct) + - sizeof(struct xstate_header); - unsigned int magic2; - - if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) - return -1; - - /* Check for the first magic field and other error scenarios. */ - if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || - fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > xstate_size || - fx_sw->xstate_size > fx_sw->extended_size) - return -1; - - /* - * Check for the presence of second magic word at the end of memory - * layout. This detects the case where the user just copied the legacy - * fpstate layout with out copying the extended state information - * in the memory layout. - */ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) - || magic2 != FP_XSTATE_MAGIC2) - return -1; - - return 0; -} - -/* - * Signal frame handlers. - */ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) -{ - if (use_fxsr()) { - struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; - struct user_i387_ia32_struct env; - struct _fpstate_ia32 __user *fp = buf; - - convert_from_fxsr(&env, tsk); - - if (__copy_to_user(buf, &env, sizeof(env)) || - __put_user(xsave->i387.swd, &fp->status) || - __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; - } else { - struct i387_fsave_struct __user *fp = buf; - u32 swd; - if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; - } - - return 0; -} - -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) -{ - struct xsave_struct __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; - u32 xfeatures; - int err; - - /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); - - if (!use_xsave()) - return err; - - err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); - - /* - * Read the xfeatures which we copied (directly from the cpu or - * from the state in task struct) to the user buffers. - */ - err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures); - - /* - * For legacy compatible, we always set FP/SSE bits in the bit - * vector while saving the state to the user context. This will - * enable us capturing any changes(during sigreturn) to - * the FP/SSE bits by the legacy applications which don't touch - * xfeatures in the xsave header. - * - * xsave aware apps can change the xfeatures in the xsave - * header as well as change any contents in the memory layout. - * xrestore as part of sigreturn will capture all the changes. - */ - xfeatures |= XSTATE_FPSSE; - - err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); - - return err; -} - -static inline int save_user_xstate(struct xsave_struct __user *buf) -{ - int err; - - if (use_xsave()) - err = xsave_user(buf); - else if (use_fxsr()) - err = fxsave_user((struct i387_fxsave_struct __user *) buf); - else - err = fsave_user((struct i387_fsave_struct __user *) buf); - - if (unlikely(err) && __clear_user(buf, xstate_size)) - err = -EFAULT; - return err; -} - -/* - * Save the fpu, extended register state to the user signal frame. - * - * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save - * state is copied. - * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. - * - * buf == buf_fx for 64-bit frames and 32-bit fsave frame. - * buf != buf_fx for 32-bit frames with fxstate. - * - * If the fpu, extended register state is live, save the state directly - * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, - * copy the thread's fpu state to the user frame starting at 'buf_fx'. - * - * If this is a 32-bit frame with fxstate, put a fsave header before - * the aligned state at 'buf_fx'. - * - * For [f]xsave state, update the SW reserved fields in the [f]xsave frame - * indicating the absence/presence of the extended state to the user. - */ -int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) -{ - struct xsave_struct *xsave = ¤t->thread.fpu.state.xsave; - struct task_struct *tsk = current; - int ia32_fxstate = (buf != buf_fx); - - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); - - if (!access_ok(VERIFY_WRITE, buf, size)) - return -EACCES; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), NULL, - (struct _fpstate_ia32 __user *) buf) ? -1 : 1; - - if (user_has_fpu()) { - /* Save the live register state to the user directly. */ - if (save_user_xstate(buf_fx)) - return -1; - /* Update the thread's fxstate to save the fsave header. */ - if (ia32_fxstate) - fpu_fxsave(&tsk->thread.fpu); - } else { - sanitize_i387_state(tsk); - if (__copy_to_user(buf_fx, xsave, xstate_size)) - return -1; - } - - /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; - - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; - - return 0; -} - -static inline void -sanitize_restored_xstate(struct task_struct *tsk, - struct user_i387_ia32_struct *ia32_env, - u64 xfeatures, int fx_only) -{ - struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; - struct xstate_header *header = &xsave->header; - - if (use_xsave()) { - /* These bits must be zero. */ - memset(header->reserved, 0, 48); - - /* - * Init the state that is not present in the memory - * layout and not enabled by the OS. - */ - if (fx_only) - header->xfeatures = XSTATE_FPSSE; - else - header->xfeatures &= (xfeatures_mask & xfeatures); - } - - if (use_fxsr()) { - /* - * mscsr reserved bits must be masked to zero for security - * reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - - convert_to_fxsr(tsk, ia32_env); - } -} - -/* - * Restore the extended state if present. Otherwise, restore the FP/SSE state. - */ -static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) -{ - if (use_xsave()) { - if ((unsigned long)buf % 64 || fx_only) { - u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE; - xrstor_state(&init_xstate_ctx, init_bv); - return fxrstor_user(buf); - } else { - u64 init_bv = xfeatures_mask & ~xbv; - if (unlikely(init_bv)) - xrstor_state(&init_xstate_ctx, init_bv); - return xrestore_user(buf, xbv); - } - } else if (use_fxsr()) { - return fxrstor_user(buf); - } else - return frstor_user(buf); -} - -int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) -{ - int ia32_fxstate = (buf != buf_fx); - struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; - int state_size = xstate_size; - u64 xfeatures = 0; - int fx_only = 0; - - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); - - if (!buf) { - fpu_reset_state(fpu); - return 0; - } - - if (!access_ok(VERIFY_READ, buf, size)) - return -EACCES; - - fpu__activate_curr(fpu); - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; - - if (use_xsave()) { - struct _fpx_sw_bytes fx_sw_user; - if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { - /* - * Couldn't find the extended state information in the - * memory layout. Restore just the FP/SSE and init all - * the other extended state. - */ - state_size = sizeof(struct i387_fxsave_struct); - fx_only = 1; - } else { - state_size = fx_sw_user.xstate_size; - xfeatures = fx_sw_user.xfeatures; - } - } - - if (ia32_fxstate) { - /* - * For 32-bit frames with fxstate, copy the user state to the - * thread's fpu state, reconstruct fxstate from the fsave - * header. Sanitize the copied state etc. - */ - struct fpu *fpu = &tsk->thread.fpu; - struct user_i387_ia32_struct env; - int err = 0; - - /* - * Drop the current fpu which clears fpu->fpstate_active. This ensures - * that any context-switch during the copy of the new state, - * avoids the intermediate state from getting restored/saved. - * Thus avoiding the new restored state from getting corrupted. - * We will be ready to restore/save the state only after - * fpu->fpstate_active is again set. - */ - drop_fpu(fpu); - - if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) { - fpstate_init(fpu); - err = -1; - } else { - sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); - } - - fpu->fpstate_active = 1; - if (use_eager_fpu()) { - preempt_disable(); - fpu__restore(); - preempt_enable(); - } - - return err; - } else { - /* - * For 64-bit frames and 32-bit fsave frames, restore the user - * state to the registers directly (with exceptions handled). - */ - user_fpu_begin(); - if (restore_user_xstate(buf_fx, xfeatures, fx_only)) { - fpu_reset_state(fpu); - return -1; - } - } - - return 0; -} - -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. - */ -static void prepare_fx_sw_frame(void) -{ - int fsave_header_size = sizeof(struct i387_fsave_struct); - int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - - if (config_enabled(CONFIG_X86_32)) - size += fsave_header_size; - - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask; - fx_sw_reserved.xstate_size = xstate_size; - - if (config_enabled(CONFIG_IA32_EMULATION)) { - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size += fsave_header_size; - } -} - -/* - * Enable the extended processor state save/restore feature. - * Called once per CPU onlining. - */ -void fpu__init_cpu_xstate(void) -{ - if (!cpu_has_xsave || !xfeatures_mask) - return; - - cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); -} - -/* - * Record the offsets and sizes of different state managed by the xsave - * memory layout. - */ -static void __init setup_xstate_features(void) -{ - int eax, ebx, ecx, edx, leaf = 0x2; - - xfeatures_nr = fls64(xfeatures_mask); - - do { - cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); - - if (eax == 0) - break; - - xstate_offsets[leaf] = ebx; - xstate_sizes[leaf] = eax; - - leaf++; - } while (1); -} - -static void print_xstate_feature(u64 xstate_mask, const char *desc) -{ - if (xfeatures_mask & xstate_mask) { - int xstate_feature = fls64(xstate_mask)-1; - - pr_info("x86/fpu: Supporting XSAVE feature %2d: '%s'\n", xstate_feature, desc); - } -} - -/* - * Print out all the supported xstate features: - */ -static void print_xstate_features(void) -{ - print_xstate_feature(XSTATE_FP, "x87 floating point registers"); - print_xstate_feature(XSTATE_SSE, "SSE registers"); - print_xstate_feature(XSTATE_YMM, "AVX registers"); - print_xstate_feature(XSTATE_BNDREGS, "MPX bounds registers"); - print_xstate_feature(XSTATE_BNDCSR, "MPX CSR"); - print_xstate_feature(XSTATE_OPMASK, "AVX-512 opmask"); - print_xstate_feature(XSTATE_ZMM_Hi256, "AVX-512 Hi256"); - print_xstate_feature(XSTATE_Hi16_ZMM, "AVX-512 ZMM_Hi256"); -} - -/* - * This function sets up offsets and sizes of all extended states in - * xsave area. This supports both standard format and compacted format - * of the xsave aread. - * - * Input: void - * Output: void - */ -void setup_xstate_comp(void) -{ - unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; - int i; - - /* - * The FP xstates and SSE xstates are legacy states. They are always - * in the fixed offsets in the xsave area in either compacted form - * or standard form. - */ - xstate_comp_offsets[0] = 0; - xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space); - - if (!cpu_has_xsaves) { - for (i = 2; i < xfeatures_nr; i++) { - if (test_bit(i, (unsigned long *)&xfeatures_mask)) { - xstate_comp_offsets[i] = xstate_offsets[i]; - xstate_comp_sizes[i] = xstate_sizes[i]; - } - } - return; - } - - xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; - - for (i = 2; i < xfeatures_nr; i++) { - if (test_bit(i, (unsigned long *)&xfeatures_mask)) - xstate_comp_sizes[i] = xstate_sizes[i]; - else - xstate_comp_sizes[i] = 0; - - if (i > 2) - xstate_comp_offsets[i] = xstate_comp_offsets[i-1] - + xstate_comp_sizes[i-1]; - - } -} - -/* - * setup the xstate image representing the init state - */ -static void setup_init_fpu_buf(void) -{ - static int on_boot_cpu = 1; - - if (!on_boot_cpu) - return; - on_boot_cpu = 0; - - if (!cpu_has_xsave) - return; - - setup_xstate_features(); - print_xstate_features(); - - if (cpu_has_xsaves) { - init_xstate_ctx.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; - init_xstate_ctx.header.xfeatures = xfeatures_mask; - } - - /* - * Init all the features state with header_bv being 0x0 - */ - xrstor_state_booting(&init_xstate_ctx, -1); - - /* - * Dump the init state again. This is to identify the init state - * of any feature which is not represented by all zero's. - */ - xsave_state_booting(&init_xstate_ctx); -} - -/* - * Calculate total size of enabled xstates in XCR0/xfeatures_mask. - */ -static void __init init_xstate_size(void) -{ - unsigned int eax, ebx, ecx, edx; - int i; - - if (!cpu_has_xsaves) { - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xstate_size = ebx; - return; - } - - xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = 2; i < 64; i++) { - if (test_bit(i, (unsigned long *)&xfeatures_mask)) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_size += eax; - } - } -} - -/* - * Enable and initialize the xsave feature. - * Called once per system bootup. - * - * ( Not marked __init because of false positive section warnings. ) - */ -void fpu__init_system_xstate(void) -{ - unsigned int eax, ebx, ecx, edx; - static bool on_boot_cpu = 1; - - if (!on_boot_cpu) - return; - on_boot_cpu = 0; - - if (!cpu_has_xsave) { - pr_info("x86/fpu: Legacy x87 FPU detected.\n"); - return; - } - - if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { - WARN(1, "x86/fpu: XSTATE_CPUID missing!\n"); - return; - } - - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask = eax + ((u64)edx << 32); - - if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { - pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); - BUG(); - } - - /* - * Support only the state known to OS. - */ - xfeatures_mask = xfeatures_mask & XCNTXT_MASK; - - /* Enable xstate instructions to be able to continue with initialization: */ - fpu__init_cpu_xstate(); - - /* - * Recompute the context size for enabled features - */ - init_xstate_size(); - - update_regset_xstate_info(xstate_size, xfeatures_mask); - prepare_fx_sw_frame(); - setup_init_fpu_buf(); - - pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n", - xfeatures_mask, - xstate_size, - cpu_has_xsaves ? "compacted" : "standard"); -} - -/* - * Restore minimal FPU state after suspend: - */ -void fpu__resume_cpu(void) -{ - /* - * Restore XCR0 on xsave capable CPUs: - */ - if (cpu_has_xsave) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); -} - -/* - * Given the xsave area and a state inside, this function returns the - * address of the state. - * - * This is the API that is called to get xstate address in either - * standard format or compacted format of xsave area. - * - * Inputs: - * xsave: base address of the xsave area; - * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE, - * etc.) - * Output: - * address of the state in the xsave area. - */ -void *get_xsave_addr(struct xsave_struct *xsave, int xstate) -{ - int feature = fls64(xstate) - 1; - if (!test_bit(feature, (unsigned long *)&xfeatures_mask)) - return NULL; - - return (void *)xsave + xstate_comp_offsets[feature]; -} -EXPORT_SYMBOL_GPL(get_xsave_addr); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c new file mode 100644 index 000000000000..f549e2a44336 --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.c @@ -0,0 +1,714 @@ +/* + * xsave/xrstor support. + * + * Author: Suresh Siddha + */ +#include +#include +#include +#include +#include +#include +#include + +/* + * Mask of xstate features supported by the CPU and the kernel: + */ +u64 xfeatures_mask; + +/* + * Represents init state for the supported extended state. + */ +struct xsave_struct init_xstate_ctx; + +static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; +static unsigned int xstate_offsets[XFEATURES_NR_MAX], xstate_sizes[XFEATURES_NR_MAX]; +static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; + +/* The number of supported xfeatures in xfeatures_mask: */ +static unsigned int xfeatures_nr; + +/* + * When executing XSAVEOPT (optimized XSAVE), if a processor implementation + * detects that an FPU state component is still (or is again) in its + * initialized state, it may clear the corresponding bit in the header.xfeatures + * field, and can skip the writeout of registers to the corresponding memory layout. + * + * This means that when the bit is zero, the state component might still contain + * some previous - non-initialized register state. + * + * Before writing xstate information to user-space we sanitize those components, + * to always ensure that the memory layout of a feature will be in the init state + * if the corresponding header bit is zero. This is to ensure that user-space doesn't + * see some stale state in the memory layout during signal handling, debugging etc. + */ +void __sanitize_i387_state(struct task_struct *tsk) +{ + struct i387_fxsave_struct *fx = &tsk->thread.fpu.state.fxsave; + int feature_bit; + u64 xfeatures; + + if (!fx) + return; + + xfeatures = tsk->thread.fpu.state.xsave.header.xfeatures; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is up to date. + */ + if ((xfeatures & xfeatures_mask) == xfeatures_mask) + return; + + /* + * FP is in init state + */ + if (!(xfeatures & XSTATE_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xfeatures & XSTATE_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + /* + * First two features are FPU and SSE, which above we handled + * in a special way already: + */ + feature_bit = 0x2; + xfeatures = (xfeatures_mask & ~xfeatures) >> 2; + + /* + * Update all the remaining memory layouts according to their + * standard xstate layout, if their header bit is in the init + * state: + */ + while (xfeatures) { + if (xfeatures & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy((void *)fx + offset, + (void *)&init_xstate_ctx + offset, + size); + } + + xfeatures >>= 1; + feature_bit++; + } +} + +/* + * Check for the presence of extended state information in the + * user fpstate pointer in the sigcontext. + */ +static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw) +{ + int min_xstate_size = sizeof(struct i387_fxsave_struct) + + sizeof(struct xstate_header); + unsigned int magic2; + + if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) + return -1; + + /* Check for the first magic field and other error scenarios. */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || + fx_sw->xstate_size < min_xstate_size || + fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fx_sw->extended_size) + return -1; + + /* + * Check for the presence of second magic word at the end of memory + * layout. This detects the case where the user just copied the legacy + * fpstate layout with out copying the extended state information + * in the memory layout. + */ + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) + || magic2 != FP_XSTATE_MAGIC2) + return -1; + + return 0; +} + +/* + * Signal frame handlers. + */ +static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +{ + if (use_fxsr()) { + struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; + struct user_i387_ia32_struct env; + struct _fpstate_ia32 __user *fp = buf; + + convert_from_fxsr(&env, tsk); + + if (__copy_to_user(buf, &env, sizeof(env)) || + __put_user(xsave->i387.swd, &fp->status) || + __put_user(X86_FXSR_MAGIC, &fp->magic)) + return -1; + } else { + struct i387_fsave_struct __user *fp = buf; + u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) + return -1; + } + + return 0; +} + +static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +{ + struct xsave_struct __user *x = buf; + struct _fpx_sw_bytes *sw_bytes; + u32 xfeatures; + int err; + + /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ + sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; + err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + + if (!use_xsave()) + return err; + + err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); + + /* + * Read the xfeatures which we copied (directly from the cpu or + * from the state in task struct) to the user buffers. + */ + err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures); + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xfeatures in the xsave header. + * + * xsave aware apps can change the xfeatures in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xfeatures |= XSTATE_FPSSE; + + err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); + + return err; +} + +static inline int save_user_xstate(struct xsave_struct __user *buf) +{ + int err; + + if (use_xsave()) + err = xsave_user(buf); + else if (use_fxsr()) + err = fxsave_user((struct i387_fxsave_struct __user *) buf); + else + err = fsave_user((struct i387_fsave_struct __user *) buf); + + if (unlikely(err) && __clear_user(buf, xstate_size)) + err = -EFAULT; + return err; +} + +/* + * Save the fpu, extended register state to the user signal frame. + * + * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save + * state is copied. + * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. + * + * buf == buf_fx for 64-bit frames and 32-bit fsave frame. + * buf != buf_fx for 32-bit frames with fxstate. + * + * If the fpu, extended register state is live, save the state directly + * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, + * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * + * If this is a 32-bit frame with fxstate, put a fsave header before + * the aligned state at 'buf_fx'. + * + * For [f]xsave state, update the SW reserved fields in the [f]xsave frame + * indicating the absence/presence of the extended state to the user. + */ +int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) +{ + struct xsave_struct *xsave = ¤t->thread.fpu.state.xsave; + struct task_struct *tsk = current; + int ia32_fxstate = (buf != buf_fx); + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!access_ok(VERIFY_WRITE, buf, size)) + return -EACCES; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), NULL, + (struct _fpstate_ia32 __user *) buf) ? -1 : 1; + + if (user_has_fpu()) { + /* Save the live register state to the user directly. */ + if (save_user_xstate(buf_fx)) + return -1; + /* Update the thread's fxstate to save the fsave header. */ + if (ia32_fxstate) + fpu_fxsave(&tsk->thread.fpu); + } else { + sanitize_i387_state(tsk); + if (__copy_to_user(buf_fx, xsave, xstate_size)) + return -1; + } + + /* Save the fsave header for the 32-bit frames. */ + if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) + return -1; + + if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) + return -1; + + return 0; +} + +static inline void +sanitize_restored_xstate(struct task_struct *tsk, + struct user_i387_ia32_struct *ia32_env, + u64 xfeatures, int fx_only) +{ + struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; + struct xstate_header *header = &xsave->header; + + if (use_xsave()) { + /* These bits must be zero. */ + memset(header->reserved, 0, 48); + + /* + * Init the state that is not present in the memory + * layout and not enabled by the OS. + */ + if (fx_only) + header->xfeatures = XSTATE_FPSSE; + else + header->xfeatures &= (xfeatures_mask & xfeatures); + } + + if (use_fxsr()) { + /* + * mscsr reserved bits must be masked to zero for security + * reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + + convert_to_fxsr(tsk, ia32_env); + } +} + +/* + * Restore the extended state if present. Otherwise, restore the FP/SSE state. + */ +static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) +{ + if (use_xsave()) { + if ((unsigned long)buf % 64 || fx_only) { + u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE; + xrstor_state(&init_xstate_ctx, init_bv); + return fxrstor_user(buf); + } else { + u64 init_bv = xfeatures_mask & ~xbv; + if (unlikely(init_bv)) + xrstor_state(&init_xstate_ctx, init_bv); + return xrestore_user(buf, xbv); + } + } else if (use_fxsr()) { + return fxrstor_user(buf); + } else + return frstor_user(buf); +} + +int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) +{ + int ia32_fxstate = (buf != buf_fx); + struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; + int state_size = xstate_size; + u64 xfeatures = 0; + int fx_only = 0; + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!buf) { + fpu_reset_state(fpu); + return 0; + } + + if (!access_ok(VERIFY_READ, buf, size)) + return -EACCES; + + fpu__activate_curr(fpu); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; + + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; + if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { + /* + * Couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. + */ + state_size = sizeof(struct i387_fxsave_struct); + fx_only = 1; + } else { + state_size = fx_sw_user.xstate_size; + xfeatures = fx_sw_user.xfeatures; + } + } + + if (ia32_fxstate) { + /* + * For 32-bit frames with fxstate, copy the user state to the + * thread's fpu state, reconstruct fxstate from the fsave + * header. Sanitize the copied state etc. + */ + struct fpu *fpu = &tsk->thread.fpu; + struct user_i387_ia32_struct env; + int err = 0; + + /* + * Drop the current fpu which clears fpu->fpstate_active. This ensures + * that any context-switch during the copy of the new state, + * avoids the intermediate state from getting restored/saved. + * Thus avoiding the new restored state from getting corrupted. + * We will be ready to restore/save the state only after + * fpu->fpstate_active is again set. + */ + drop_fpu(fpu); + + if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || + __copy_from_user(&env, buf, sizeof(env))) { + fpstate_init(fpu); + err = -1; + } else { + sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); + } + + fpu->fpstate_active = 1; + if (use_eager_fpu()) { + preempt_disable(); + fpu__restore(); + preempt_enable(); + } + + return err; + } else { + /* + * For 64-bit frames and 32-bit fsave frames, restore the user + * state to the registers directly (with exceptions handled). + */ + user_fpu_begin(); + if (restore_user_xstate(buf_fx, xfeatures, fx_only)) { + fpu_reset_state(fpu); + return -1; + } + } + + return 0; +} + +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed by the fpstate pointer in the sigcontext. + * This will be saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +static void prepare_fx_sw_frame(void) +{ + int fsave_header_size = sizeof(struct i387_fsave_struct); + int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + + if (config_enabled(CONFIG_X86_32)) + size += fsave_header_size; + + fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; + fx_sw_reserved.extended_size = size; + fx_sw_reserved.xfeatures = xfeatures_mask; + fx_sw_reserved.xstate_size = xstate_size; + + if (config_enabled(CONFIG_IA32_EMULATION)) { + fx_sw_reserved_ia32 = fx_sw_reserved; + fx_sw_reserved_ia32.extended_size += fsave_header_size; + } +} + +/* + * Enable the extended processor state save/restore feature. + * Called once per CPU onlining. + */ +void fpu__init_cpu_xstate(void) +{ + if (!cpu_has_xsave || !xfeatures_mask) + return; + + cr4_set_bits(X86_CR4_OSXSAVE); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); +} + +/* + * Record the offsets and sizes of different state managed by the xsave + * memory layout. + */ +static void __init setup_xstate_features(void) +{ + int eax, ebx, ecx, edx, leaf = 0x2; + + xfeatures_nr = fls64(xfeatures_mask); + + do { + cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0) + break; + + xstate_offsets[leaf] = ebx; + xstate_sizes[leaf] = eax; + + leaf++; + } while (1); +} + +static void print_xstate_feature(u64 xstate_mask, const char *desc) +{ + if (xfeatures_mask & xstate_mask) { + int xstate_feature = fls64(xstate_mask)-1; + + pr_info("x86/fpu: Supporting XSAVE feature %2d: '%s'\n", xstate_feature, desc); + } +} + +/* + * Print out all the supported xstate features: + */ +static void print_xstate_features(void) +{ + print_xstate_feature(XSTATE_FP, "x87 floating point registers"); + print_xstate_feature(XSTATE_SSE, "SSE registers"); + print_xstate_feature(XSTATE_YMM, "AVX registers"); + print_xstate_feature(XSTATE_BNDREGS, "MPX bounds registers"); + print_xstate_feature(XSTATE_BNDCSR, "MPX CSR"); + print_xstate_feature(XSTATE_OPMASK, "AVX-512 opmask"); + print_xstate_feature(XSTATE_ZMM_Hi256, "AVX-512 Hi256"); + print_xstate_feature(XSTATE_Hi16_ZMM, "AVX-512 ZMM_Hi256"); +} + +/* + * This function sets up offsets and sizes of all extended states in + * xsave area. This supports both standard format and compacted format + * of the xsave aread. + * + * Input: void + * Output: void + */ +void setup_xstate_comp(void) +{ + unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; + int i; + + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_comp_offsets[0] = 0; + xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space); + + if (!cpu_has_xsaves) { + for (i = 2; i < xfeatures_nr; i++) { + if (test_bit(i, (unsigned long *)&xfeatures_mask)) { + xstate_comp_offsets[i] = xstate_offsets[i]; + xstate_comp_sizes[i] = xstate_sizes[i]; + } + } + return; + } + + xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = 2; i < xfeatures_nr; i++) { + if (test_bit(i, (unsigned long *)&xfeatures_mask)) + xstate_comp_sizes[i] = xstate_sizes[i]; + else + xstate_comp_sizes[i] = 0; + + if (i > 2) + xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + + xstate_comp_sizes[i-1]; + + } +} + +/* + * setup the xstate image representing the init state + */ +static void setup_init_fpu_buf(void) +{ + static int on_boot_cpu = 1; + + if (!on_boot_cpu) + return; + on_boot_cpu = 0; + + if (!cpu_has_xsave) + return; + + setup_xstate_features(); + print_xstate_features(); + + if (cpu_has_xsaves) { + init_xstate_ctx.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; + init_xstate_ctx.header.xfeatures = xfeatures_mask; + } + + /* + * Init all the features state with header_bv being 0x0 + */ + xrstor_state_booting(&init_xstate_ctx, -1); + + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + xsave_state_booting(&init_xstate_ctx); +} + +/* + * Calculate total size of enabled xstates in XCR0/xfeatures_mask. + */ +static void __init init_xstate_size(void) +{ + unsigned int eax, ebx, ecx, edx; + int i; + + if (!cpu_has_xsaves) { + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + xstate_size = ebx; + return; + } + + xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for (i = 2; i < 64; i++) { + if (test_bit(i, (unsigned long *)&xfeatures_mask)) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_size += eax; + } + } +} + +/* + * Enable and initialize the xsave feature. + * Called once per system bootup. + * + * ( Not marked __init because of false positive section warnings. ) + */ +void fpu__init_system_xstate(void) +{ + unsigned int eax, ebx, ecx, edx; + static bool on_boot_cpu = 1; + + if (!on_boot_cpu) + return; + on_boot_cpu = 0; + + if (!cpu_has_xsave) { + pr_info("x86/fpu: Legacy x87 FPU detected.\n"); + return; + } + + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN(1, "x86/fpu: XSTATE_CPUID missing!\n"); + return; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + xfeatures_mask = eax + ((u64)edx << 32); + + if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); + BUG(); + } + + /* + * Support only the state known to OS. + */ + xfeatures_mask = xfeatures_mask & XCNTXT_MASK; + + /* Enable xstate instructions to be able to continue with initialization: */ + fpu__init_cpu_xstate(); + + /* + * Recompute the context size for enabled features + */ + init_xstate_size(); + + update_regset_xstate_info(xstate_size, xfeatures_mask); + prepare_fx_sw_frame(); + setup_init_fpu_buf(); + + pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n", + xfeatures_mask, + xstate_size, + cpu_has_xsaves ? "compacted" : "standard"); +} + +/* + * Restore minimal FPU state after suspend: + */ +void fpu__resume_cpu(void) +{ + /* + * Restore XCR0 on xsave capable CPUs: + */ + if (cpu_has_xsave) + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); +} + +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Inputs: + * xsave: base address of the xsave area; + * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE, + * etc.) + * Output: + * address of the state in the xsave area. + */ +void *get_xsave_addr(struct xsave_struct *xsave, int xstate) +{ + int feature = fls64(xstate) - 1; + if (!test_bit(feature, (unsigned long *)&xfeatures_mask)) + return NULL; + + return (void *)xsave + xstate_comp_offsets[feature]; +} +EXPORT_SYMBOL_GPL(get_xsave_addr); -- cgit v1.2.1 From 5b07343034ec9ebb0d443743c7381b25875d0067 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 08:51:17 +0200 Subject: x86/fpu: Introduce cpu_has_xfeatures(xfeatures_mask, feature_name) A lot of FPU using driver code is querying complex CPU features to be able to figure out whether a given set of xstate features is supported by the CPU or not. Introduce a simplified API function that can be used on any CPU type to get this information. Also add an error string return pointer, so that the driver can print a meaningful error message with a standardized feature name. Also mark xfeatures_mask as __read_only. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/api.h | 9 +++++++ arch/x86/kernel/fpu/xstate.c | 53 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 62035cc1d961..1429a7c736db 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -36,4 +36,13 @@ extern bool irq_fpu_usable(void); extern int irq_ts_save(void); extern void irq_ts_restore(int TS_state); +/* + * Query the presence of one or more xfeatures. Works on any legacy CPU as well. + * + * If 'feature_name' is set then put a human-readable description of + * the feature there as well - this can be used to print error (or success) + * messages. + */ +extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f549e2a44336..2e52f01f4931 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -11,10 +11,23 @@ #include #include +static const char *xfeature_names[] = +{ + "x87 floating point registers" , + "SSE registers" , + "AVX registers" , + "MPX bounds registers" , + "MPX CSR" , + "AVX-512 opmask" , + "AVX-512 Hi256" , + "AVX-512 ZMM_Hi256" , + "unknown xstate feature" , +}; + /* * Mask of xstate features supported by the CPU and the kernel: */ -u64 xfeatures_mask; +u64 xfeatures_mask __read_mostly; /* * Represents init state for the supported extended state. @@ -28,6 +41,44 @@ static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; /* The number of supported xfeatures in xfeatures_mask: */ static unsigned int xfeatures_nr; +/* + * Return whether the system supports a given xfeature. + * + * Also return the name of the (most advanced) feature that the caller requested: + */ +int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) +{ + u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask; + + if (unlikely(feature_name)) { + long xfeature_idx, max_idx; + u64 xfeatures_print; + /* + * So we use FLS here to be able to print the most advanced + * feature that was requested but is missing. So if a driver + * asks about "XSTATE_SSE | XSTATE_YMM" we'll print the + * missing AVX feature - this is the most informative message + * to users: + */ + if (xfeatures_missing) + xfeatures_print = xfeatures_missing; + else + xfeatures_print = xfeatures_needed; + + xfeature_idx = fls64(xfeatures_print)-1; + max_idx = ARRAY_SIZE(xfeature_names)-1; + xfeature_idx = min(xfeature_idx, max_idx); + + *feature_name = xfeature_names[xfeature_idx]; + } + + if (xfeatures_missing) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(cpu_has_xfeatures); + /* * When executing XSAVEOPT (optimized XSAVE), if a processor implementation * detects that an FPU state component is still (or is again) in its -- cgit v1.2.1 From 33588b52228f6e94e970a05ead10f524f363ee44 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 09:17:26 +0200 Subject: x86/fpu: Simplify print_xstate_features() We do a boot time printout of xfeatures in print_xstate_features(), simplify this code to make use of the recently introduced cpu_has_xfeature() method. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 2e52f01f4931..0f849229c93b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -545,13 +545,12 @@ static void __init setup_xstate_features(void) } while (1); } -static void print_xstate_feature(u64 xstate_mask, const char *desc) +static void print_xstate_feature(u64 xstate_mask) { - if (xfeatures_mask & xstate_mask) { - int xstate_feature = fls64(xstate_mask)-1; + const char *feature_name; - pr_info("x86/fpu: Supporting XSAVE feature %2d: '%s'\n", xstate_feature, desc); - } + if (cpu_has_xfeatures(xstate_mask, &feature_name)) + pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name); } /* @@ -559,14 +558,14 @@ static void print_xstate_feature(u64 xstate_mask, const char *desc) */ static void print_xstate_features(void) { - print_xstate_feature(XSTATE_FP, "x87 floating point registers"); - print_xstate_feature(XSTATE_SSE, "SSE registers"); - print_xstate_feature(XSTATE_YMM, "AVX registers"); - print_xstate_feature(XSTATE_BNDREGS, "MPX bounds registers"); - print_xstate_feature(XSTATE_BNDCSR, "MPX CSR"); - print_xstate_feature(XSTATE_OPMASK, "AVX-512 opmask"); - print_xstate_feature(XSTATE_ZMM_Hi256, "AVX-512 Hi256"); - print_xstate_feature(XSTATE_Hi16_ZMM, "AVX-512 ZMM_Hi256"); + print_xstate_feature(XSTATE_FP); + print_xstate_feature(XSTATE_SSE); + print_xstate_feature(XSTATE_YMM); + print_xstate_feature(XSTATE_BNDREGS); + print_xstate_feature(XSTATE_BNDCSR); + print_xstate_feature(XSTATE_OPMASK); + print_xstate_feature(XSTATE_ZMM_Hi256); + print_xstate_feature(XSTATE_Hi16_ZMM); } /* -- cgit v1.2.1 From 677b98bdd5b93d89ad58b8a4679db086dfbaa854 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 09:40:26 +0200 Subject: x86/fpu: Enumerate xfeature bits Transform the xstate masks into an enumerated list of xfeature bits. This removes the hard coding of XFEATURES_NR_MAX. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 2f2ed322263f..9b2869dea490 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -4,25 +4,39 @@ #include #include -#define XSTATE_CPUID 0x0000000d - -#define XSTATE_FP 0x1 -#define XSTATE_SSE 0x2 -#define XSTATE_YMM 0x4 -#define XSTATE_BNDREGS 0x8 -#define XSTATE_BNDCSR 0x10 -#define XSTATE_OPMASK 0x20 -#define XSTATE_ZMM_Hi256 0x40 -#define XSTATE_Hi16_ZMM 0x80 +/* + * List of XSAVE features Linux knows about: + */ +enum xfeature_bit { + XSTATE_BIT_FP, + XSTATE_BIT_SSE, + XSTATE_BIT_YMM, + XSTATE_BIT_BNDREGS, + XSTATE_BIT_BNDCSR, + XSTATE_BIT_OPMASK, + XSTATE_BIT_ZMM_Hi256, + XSTATE_BIT_Hi16_ZMM, + + XFEATURES_NR_MAX, +}; + +#define XSTATE_FP (1 << XSTATE_BIT_FP) +#define XSTATE_SSE (1 << XSTATE_BIT_SSE) +#define XSTATE_YMM (1 << XSTATE_BIT_YMM) +#define XSTATE_BNDREGS (1 << XSTATE_BIT_BNDREGS) +#define XSTATE_BNDCSR (1 << XSTATE_BIT_BNDCSR) +#define XSTATE_OPMASK (1 << XSTATE_BIT_OPMASK) +#define XSTATE_ZMM_Hi256 (1 << XSTATE_BIT_ZMM_Hi256) +#define XSTATE_Hi16_ZMM (1 << XSTATE_BIT_Hi16_ZMM) -/* The highest xstate bit above (of XSTATE_Hi16_ZMM): */ -#define XFEATURES_NR_MAX 8 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) #define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) /* Bit 63 of XCR0 is reserved for future expansion */ #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) +#define XSTATE_CPUID 0x0000000d + #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 -- cgit v1.2.1 From 91969d690f594f22c15055daf89cecd33584d65e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 09:46:04 +0200 Subject: x86/fpu: Move xfeature type enumeration to fpu/types.h So xsave.h is an internal header that FPU using drivers commonly include, to get access to the xstate feature names, amongst other things. Move these type definitions to fpu/fpu.h to allow simplification of FPU using driver code. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 28 ++++++++++++++++++++++++++++ arch/x86/include/asm/fpu/xstate.h | 28 ---------------------------- 2 files changed, 28 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 3a15ac6032eb..006ec2975f6f 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -78,6 +78,34 @@ struct i387_soft_struct { u32 entry_eip; }; +/* + * List of XSAVE features Linux knows about: + */ +enum xfeature_bit { + XSTATE_BIT_FP, + XSTATE_BIT_SSE, + XSTATE_BIT_YMM, + XSTATE_BIT_BNDREGS, + XSTATE_BIT_BNDCSR, + XSTATE_BIT_OPMASK, + XSTATE_BIT_ZMM_Hi256, + XSTATE_BIT_Hi16_ZMM, + + XFEATURES_NR_MAX, +}; + +#define XSTATE_FP (1 << XSTATE_BIT_FP) +#define XSTATE_SSE (1 << XSTATE_BIT_SSE) +#define XSTATE_YMM (1 << XSTATE_BIT_YMM) +#define XSTATE_BNDREGS (1 << XSTATE_BIT_BNDREGS) +#define XSTATE_BNDCSR (1 << XSTATE_BIT_BNDCSR) +#define XSTATE_OPMASK (1 << XSTATE_BIT_OPMASK) +#define XSTATE_ZMM_Hi256 (1 << XSTATE_BIT_ZMM_Hi256) +#define XSTATE_Hi16_ZMM (1 << XSTATE_BIT_Hi16_ZMM) + +#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) + /* * There are 16x 256-bit AVX registers named YMM0-YMM15. * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 9b2869dea490..31a002ad5aeb 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -4,34 +4,6 @@ #include #include -/* - * List of XSAVE features Linux knows about: - */ -enum xfeature_bit { - XSTATE_BIT_FP, - XSTATE_BIT_SSE, - XSTATE_BIT_YMM, - XSTATE_BIT_BNDREGS, - XSTATE_BIT_BNDCSR, - XSTATE_BIT_OPMASK, - XSTATE_BIT_ZMM_Hi256, - XSTATE_BIT_Hi16_ZMM, - - XFEATURES_NR_MAX, -}; - -#define XSTATE_FP (1 << XSTATE_BIT_FP) -#define XSTATE_SSE (1 << XSTATE_BIT_SSE) -#define XSTATE_YMM (1 << XSTATE_BIT_YMM) -#define XSTATE_BNDREGS (1 << XSTATE_BIT_BNDREGS) -#define XSTATE_BNDCSR (1 << XSTATE_BIT_BNDCSR) -#define XSTATE_OPMASK (1 << XSTATE_BIT_OPMASK) -#define XSTATE_ZMM_Hi256 (1 << XSTATE_BIT_ZMM_Hi256) -#define XSTATE_Hi16_ZMM (1 << XSTATE_BIT_Hi16_ZMM) - - -#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) -#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) /* Bit 63 of XCR0 is reserved for future expansion */ #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) -- cgit v1.2.1 From ce4f5f9b65ae3d630e375b5e406feca31f682886 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 09:59:44 +0200 Subject: x86/fpu, crypto x86/camellia_aesni_avx: Simplify the camellia_aesni_init() xfeature checks Use the new 'cpu_has_xfeatures()' function to query AVX CPU support. This has the following advantages to the driver: - Decouples the driver from FPU internals: it's now only using . - Removes detection complexity from the driver, no more raw XGETBV instruction - Shrinks the code a bit: text data bss dec hex filename 2128 2896 0 5024 13a0 camellia_aesni_avx_glue.o.before 2067 2896 0 4963 1363 camellia_aesni_avx_glue.o.after - Standardizes feature name error message printouts across drivers There are also advantages to the x86 FPU code: once all drivers are decoupled from internals we can move them out of common headers and we'll also be able to remove xcr.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/camellia_aesni_avx_glue.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 0122cd95563a..80a0e4389c9a 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -19,8 +19,7 @@ #include #include #include -#include -#include +#include #include #include @@ -553,16 +552,10 @@ static struct crypto_alg cmll_algs[10] = { { static int __init camellia_aesni_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { - pr_info("AVX or AES-NI instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } -- cgit v1.2.1 From 70d51eb65ddf53a00e6801c2ee541a7219e5685a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 10:11:24 +0200 Subject: x86/fpu, crypto x86/sha256_ssse3: Simplify the sha256_ssse3_mod_init() xfeature checks Use the new 'cpu_has_xfeatures()' function to query AVX CPU support. This has the following advantages to the driver: - Decouples the driver from FPU internals: it's now only using . - Removes detection complexity from the driver, no more raw XGETBV instruction - Shrinks the code a bit. - Standardizes feature name error message printouts across drivers There are also advantages to the x86 FPU code: once all drivers are decoupled from internals we can move them out of common headers and we'll also be able to remove xcr.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/sha256_ssse3_glue.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index eb65522f02b8..f8097fc0d1d1 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -38,8 +38,6 @@ #include #include #include -#include -#include #include asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, @@ -132,15 +130,9 @@ static struct shash_alg algs[] = { { #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) { - u64 xcr0; - - if (!cpu_has_avx || !cpu_has_osxsave) - return false; - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); - + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } -- cgit v1.2.1 From 7bc371faa9cd661589d13575cd48aa168e04e208 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 10:11:24 +0200 Subject: x86/fpu, crypto x86/camellia_aesni_avx2: Simplify the camellia_aesni_init() xfeature checks Use the new 'cpu_has_xfeatures()' function to query AVX CPU support. This has the following advantages to the driver: - Decouples the driver from FPU internals: it's now only using . - Removes detection complexity from the driver, no more raw XGETBV instruction - Shrinks the code a bit. - Standardizes feature name error message printouts across drivers There are also advantages to the x86 FPU code: once all drivers are decoupled from internals we can move them out of common headers and we'll also be able to remove xcr.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/camellia_aesni_avx2_glue.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 46484482f267..76ea7df217e6 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -19,8 +19,7 @@ #include #include #include -#include -#include +#include #include #include @@ -561,16 +560,10 @@ static struct crypto_alg cmll_algs[10] = { { static int __init camellia_aesni_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { - pr_info("AVX2 or AES-NI instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX2 detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } -- cgit v1.2.1 From 4eecd2616d6a6e91b57659281ca6bf243264f8db Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 10:11:24 +0200 Subject: x86/fpu, crypto x86/twofish_avx: Simplify the twofish_init() xfeature checks Use the new 'cpu_has_xfeatures()' function to query AVX CPU support. This has the following advantages to the driver: - Decouples the driver from FPU internals: it's now only using . - Removes detection complexity from the driver, no more raw XGETBV instruction - Shrinks the code a bit. - Standardizes feature name error message printouts across drivers There are also advantages to the x86 FPU code: once all drivers are decoupled from internals we can move them out of common headers and we'll also be able to remove xcr.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/twofish_avx_glue.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 95434f5b705a..c2bd0ce718ee 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -37,8 +37,6 @@ #include #include #include -#include -#include #include #include #include @@ -558,16 +556,10 @@ static struct crypto_alg twofish_algs[10] = { { static int __init twofish_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - printk(KERN_INFO "AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - printk(KERN_INFO "AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } -- cgit v1.2.1 From c1c23f7e5e1ef5e72c21f0615fbed560be5b8fa9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 10:11:24 +0200 Subject: x86/fpu, crypto x86/serpent_avx: Simplify the serpent_init() xfeature checks Use the new 'cpu_has_xfeatures()' function to query AVX CPU support. This has the following advantages to the driver: - Decouples the driver from FPU internals: it's now only using . - Removes detection complexity from the driver, no more raw XGETBV instruction - Shrinks the code a bit. - Standardizes feature name error message printouts across drivers There are also advantages to the x86 FPU code: once all drivers are decoupled from internals we can move them out of common headers and we'll also be able to remove xcr.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/serpent_avx_glue.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index f66ae85f58fe..da7dafc9b16d 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -36,8 +36,7 @@ #include #include #include -#include -#include +#include #include #include @@ -596,16 +595,10 @@ static struct crypto_alg serpent_algs[10] = { { static int __init serpent_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - printk(KERN_INFO "AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - printk(KERN_INFO "AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } -- cgit v1.2.1 From d5d34d98d2df01766854c7eb7044943b596d824f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 10:11:24 +0200 Subject: x86/fpu, crypto x86/cast5_avx: Simplify the cast5_init() xfeature checks Use the new 'cpu_has_xfeatures()' function to query AVX CPU support. This has the following advantages to the driver: - Decouples the driver from FPU internals: it's now only using . - Removes detection complexity from the driver, no more raw XGETBV instruction - Shrinks the code a bit. - Standardizes feature name error message printouts across drivers There are also advantages to the x86 FPU code: once all drivers are decoupled from internals we can move them out of common headers and we'll also be able to remove xcr.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/cast5_avx_glue.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index ca4f32e7a423..be00aa48b2b5 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -31,8 +31,7 @@ #include #include #include -#include -#include +#include #include #define CAST5_PARALLEL_BLOCKS 16 @@ -468,16 +467,10 @@ static struct crypto_alg cast5_algs[6] = { { static int __init cast5_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - pr_info("AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } -- cgit v1.2.1 From c93b8a3963242e034a4d1ec87090c62bb25a575c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 10:11:24 +0200 Subject: x86/fpu, crypto x86/sha512_ssse3: Simplify the sha512_ssse3_mod_init() xfeature checks Use the new 'cpu_has_xfeatures()' function to query AVX CPU support. This has the following advantages to the driver: - Decouples the driver from FPU internals: it's now only using . - Removes detection complexity from the driver, no more raw XGETBV instruction - Shrinks the code a bit. - Standardizes feature name error message printouts across drivers There are also advantages to the x86 FPU code: once all drivers are decoupled from internals we can move them out of common headers and we'll also be able to remove xcr.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/sha512_ssse3_glue.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 78914641c72b..2edad7b81870 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -36,8 +36,6 @@ #include #include #include -#include -#include #include @@ -131,15 +129,9 @@ static struct shash_alg algs[] = { { #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) { - u64 xcr0; - - if (!cpu_has_avx || !cpu_has_osxsave) - return false; - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); - + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } -- cgit v1.2.1 From 1debf7db2b9e156ecd69830244285b854e85a71c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 10:11:24 +0200 Subject: x86/fpu, crypto x86/cast6_avx: Simplify the cast6_init() xfeature checks Use the new 'cpu_has_xfeatures()' function to query AVX CPU support. This has the following advantages to the driver: - Decouples the driver from FPU internals: it's now only using . - Removes detection complexity from the driver, no more raw XGETBV instruction - Shrinks the code a bit. - Standardizes feature name error message printouts across drivers There are also advantages to the x86 FPU code: once all drivers are decoupled from internals we can move them out of common headers and we'll also be able to remove xcr.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/cast6_avx_glue.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 21d0b845c8c4..5dbba7224221 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -36,8 +36,7 @@ #include #include #include -#include -#include +#include #include #define CAST6_PARALLEL_BLOCKS 8 @@ -590,16 +589,10 @@ static struct crypto_alg cast6_algs[10] = { { static int __init cast6_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - pr_info("AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } -- cgit v1.2.1 From d1e509660c690597edb7e15efd22a5e78b8e2fd5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 10:11:24 +0200 Subject: x86/fpu, crypto x86/sha1_ssse3: Simplify the sha1_ssse3_mod_init() xfeature checks Use the new 'cpu_has_xfeatures()' function to query AVX CPU support. This has the following advantages to the driver: - Decouples the driver from FPU internals: it's now only using . - Removes detection complexity from the driver, no more raw XGETBV instruction - Shrinks the code a bit. - Standardizes feature name error message printouts across drivers There are also advantages to the x86 FPU code: once all drivers are decoupled from internals we can move them out of common headers and we'll also be able to remove xcr.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/sha1_ssse3_glue.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 84db12f052e8..7c48e8b20848 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -30,8 +30,6 @@ #include #include #include -#include -#include asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, @@ -123,15 +121,9 @@ static struct shash_alg alg = { #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) { - u64 xcr0; - - if (!cpu_has_avx || !cpu_has_osxsave) - return false; - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); - + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } -- cgit v1.2.1 From 534ff06e39292b66b46d6318191b552b5bb2d1e7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 10:11:24 +0200 Subject: x86/fpu, crypto x86/serpent_avx2: Simplify the init() xfeature checks Use the new 'cpu_has_xfeatures()' function to query AVX CPU support. This has the following advantages to the driver: - Decouples the driver from FPU internals: it's now only using . - Removes detection complexity from the driver, no more raw XGETBV instruction - Shrinks the code a bit. - Standardizes feature name error message printouts across drivers There are also advantages to the x86 FPU code: once all drivers are decoupled from internals we can move them out of common headers and we'll also be able to remove xcr.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/serpent_avx2_glue.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index aa325fa5c7a6..f226ad41fde1 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -20,8 +20,7 @@ #include #include #include -#include -#include +#include #include #include @@ -537,16 +536,10 @@ static struct crypto_alg srp_algs[10] = { { static int __init init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx2 || !cpu_has_osxsave) { - pr_info("AVX2 instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } -- cgit v1.2.1 From 57dd083e0cd0bb1ce447941b6438bf0a8372fc19 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 10:59:00 +0200 Subject: x86/fpu, crypto x86/sha1_mb: Remove FPU internal headers from sha1_mb.c This file only uses the public FPU APIs, so remove the xcr.h, fpu/xstate.h and fpu/internal.h headers and add the fpu/api.h include. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/crypto/sha-mb/sha1_mb.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index 6f3f76568bd5..f53ed1dc88ea 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -65,10 +65,8 @@ #include #include #include -#include -#include #include -#include +#include #include "sha_mb_ctx.h" #define FLUSH_INTERVAL 1000 /* in usec */ -- cgit v1.2.1 From befc61ad3c097bb6ace3da0c73ad56272ccee02d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 10:56:54 +0200 Subject: x86/fpu: Move asm/xcr.h to asm/fpu/internal.h Now that all FPU internals using drivers are converted to public APIs, move xcr.h's definitions into fpu/internal.h and remove xcr.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 25 +++++++++++++++++++ arch/x86/include/asm/xcr.h | 49 ------------------------------------- arch/x86/kernel/fpu/xstate.c | 1 - arch/x86/kvm/vmx.c | 1 - arch/x86/kvm/x86.c | 1 - 5 files changed, 25 insertions(+), 52 deletions(-) delete mode 100644 arch/x86/include/asm/xcr.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 8ec785ecce81..161b51bf267e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -430,6 +430,31 @@ static inline void fpu_reset_state(struct fpu *fpu) restore_init_xstate(); } +/* + * Definitions for the eXtended Control Register instructions + */ + +#define XCR_XFEATURE_ENABLED_MASK 0x00000000 + +static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ + : : "a" (eax), "d" (edx), "c" (index)); +} + /* * FPU state switching for scheduling. * diff --git a/arch/x86/include/asm/xcr.h b/arch/x86/include/asm/xcr.h deleted file mode 100644 index f2cba4e79a23..000000000000 --- a/arch/x86/include/asm/xcr.h +++ /dev/null @@ -1,49 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2008 rPath, Inc. - All Rights Reserved - * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2 or (at your - * option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * asm-x86/xcr.h - * - * Definitions for the eXtended Control Register instructions - */ - -#ifndef _ASM_X86_XCR_H -#define _ASM_X86_XCR_H - -#define XCR_XFEATURE_ENABLED_MASK 0x00000000 - -#ifdef __KERNEL__ -# ifndef __ASSEMBLY__ - -#include - -static inline u64 xgetbv(u32 index) -{ - u32 eax, edx; - - asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ - : "=a" (eax), "=d" (edx) - : "c" (index)); - return eax + ((u64)edx << 32); -} - -static inline void xsetbv(u32 index, u64 value) -{ - u32 eax = value; - u32 edx = value >> 32; - - asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ - : : "a" (eax), "d" (edx), "c" (index)); -} - -# endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_XCR_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0f849229c93b..c087f2d0f2d1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -9,7 +9,6 @@ #include #include #include -#include static const char *xfeature_names[] = { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f93ae71416e4..2de55e953842 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ff4df77e069..5c61aae277f9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -61,7 +61,6 @@ #include #include #include /* Ugh! */ -#include #include #include -- cgit v1.2.1 From d0903193124132c6bb59a895eeb0656f86013da1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 11:11:10 +0200 Subject: x86/fpu: Rename sanitize_i387_state() to fpstate_sanitize_xstate() So the sanitize_i387_state() function has the following purpose: on CPUs that support optimized xstate saving instructions, an FPU fpstate might end up having partially uninitialized data. This function initializes that data. Note that the function name is a misnomer and confusing on two levels, not only is it not i387 specific at all, but it is the exact opposite: it only matters on xstate CPUs. So rename sanitize_i387_state() and __sanitize_i387_state() to fpstate_sanitize_xstate() and __fpstate_sanitize_xstate(), to clearly express the purpose and usage of the function. We'll further clean up this function in the next patch. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 6 +++--- arch/x86/kernel/fpu/core.c | 8 ++++---- arch/x86/kernel/fpu/xstate.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 161b51bf267e..6b6fa46037f8 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -139,13 +139,13 @@ static inline void fx_finit(struct i387_fxsave_struct *fx) fx->mxcsr = MXCSR_DEFAULT; } -extern void __sanitize_i387_state(struct task_struct *); +extern void __fpstate_sanitize_xstate(struct task_struct *); -static inline void sanitize_i387_state(struct task_struct *tsk) +static inline void fpstate_sanitize_xstate(struct task_struct *tsk) { if (!use_xsaveopt()) return; - __sanitize_i387_state(tsk); + __fpstate_sanitize_xstate(tsk); } #define user_insn(insn, output, input...) \ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index edbb5d04a558..561a3532abc2 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -395,7 +395,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, return -ENODEV; fpu__activate_stopped(fpu); - sanitize_i387_state(target); + fpstate_sanitize_xstate(target); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fpu->state.fxsave, 0, -1); @@ -412,7 +412,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, return -ENODEV; fpu__activate_stopped(fpu); - sanitize_i387_state(target); + fpstate_sanitize_xstate(target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpu->state.fxsave, 0, -1); @@ -644,7 +644,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, &fpu->state.fsave, 0, -1); - sanitize_i387_state(target); + fpstate_sanitize_xstate(target); if (kbuf && pos == 0 && count == sizeof(env)) { convert_from_fxsr(kbuf, target); @@ -666,7 +666,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_stopped(fpu); - sanitize_i387_state(target); + fpstate_sanitize_xstate(target); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c087f2d0f2d1..fc2ff1239fea 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(cpu_has_xfeatures); * if the corresponding header bit is zero. This is to ensure that user-space doesn't * see some stale state in the memory layout during signal handling, debugging etc. */ -void __sanitize_i387_state(struct task_struct *tsk) +void __fpstate_sanitize_xstate(struct task_struct *tsk) { struct i387_fxsave_struct *fx = &tsk->thread.fpu.state.fxsave; int feature_bit; @@ -318,7 +318,7 @@ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (ia32_fxstate) fpu_fxsave(&tsk->thread.fpu); } else { - sanitize_i387_state(tsk); + fpstate_sanitize_xstate(tsk); if (__copy_to_user(buf_fx, xsave, xstate_size)) return -1; } -- cgit v1.2.1 From 1ac91a767f1d2ac049dc11e5b7e4342c63c21538 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 11:17:55 +0200 Subject: x86/fpu: Simplify fpstate_sanitize_xstate() calls Remove the extra layer of __fpstate_sanitize_xstate(): if (!use_xsaveopt()) return; __fpstate_sanitize_xstate(tsk); and move the check for use_xsaveopt() into fpstate_sanitize_xstate(). In general we optimize for the presence of CPU features, not for the absence of them. Furthermore there's little point in this inlining, as the call sites are not super hot code paths. Doing this uninlining shrinks the code a bit: text data bss dec hex filename 14108751 2573624 1634304 18316679 1177d87 vmlinux.before 14108627 2573624 1634304 18316555 1177d0b vmlinux.after Also remove a pointless '!fx' check from fpstate_sanitize_xstate(). Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 9 +-------- arch/x86/kernel/fpu/xstate.c | 4 ++-- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 6b6fa46037f8..88fec3f108de 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -139,14 +139,7 @@ static inline void fx_finit(struct i387_fxsave_struct *fx) fx->mxcsr = MXCSR_DEFAULT; } -extern void __fpstate_sanitize_xstate(struct task_struct *); - -static inline void fpstate_sanitize_xstate(struct task_struct *tsk) -{ - if (!use_xsaveopt()) - return; - __fpstate_sanitize_xstate(tsk); -} +extern void fpstate_sanitize_xstate(struct task_struct *); #define user_insn(insn, output, input...) \ ({ \ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index fc2ff1239fea..47b9591947e1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -92,13 +92,13 @@ EXPORT_SYMBOL_GPL(cpu_has_xfeatures); * if the corresponding header bit is zero. This is to ensure that user-space doesn't * see some stale state in the memory layout during signal handling, debugging etc. */ -void __fpstate_sanitize_xstate(struct task_struct *tsk) +void fpstate_sanitize_xstate(struct task_struct *tsk) { struct i387_fxsave_struct *fx = &tsk->thread.fpu.state.fxsave; int feature_bit; u64 xfeatures; - if (!fx) + if (!use_xsaveopt()) return; xfeatures = tsk->thread.fpu.state.xsave.header.xfeatures; -- cgit v1.2.1 From 36e49e7f2ec8fa2cdf1ec0439374583ea0a82c47 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 11:25:02 +0200 Subject: x86/fpu: Pass 'struct fpu' to fpstate_sanitize_xstate() Currently fpstate_sanitize_xstate() has a task_struct input parameter, but it only uses the fpu structure from it - so pass in a 'struct fpu' pointer only and update all call sites. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/core.c | 9 ++++----- arch/x86/kernel/fpu/xstate.c | 8 ++++---- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 88fec3f108de..da96b0cbfcb3 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -139,7 +139,7 @@ static inline void fx_finit(struct i387_fxsave_struct *fx) fx->mxcsr = MXCSR_DEFAULT; } -extern void fpstate_sanitize_xstate(struct task_struct *); +extern void fpstate_sanitize_xstate(struct fpu *fpu); #define user_insn(insn, output, input...) \ ({ \ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 561a3532abc2..1d6e97c59a72 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -395,7 +395,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, return -ENODEV; fpu__activate_stopped(fpu); - fpstate_sanitize_xstate(target); + fpstate_sanitize_xstate(fpu); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fpu->state.fxsave, 0, -1); @@ -412,7 +412,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, return -ENODEV; fpu__activate_stopped(fpu); - fpstate_sanitize_xstate(target); + fpstate_sanitize_xstate(fpu); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpu->state.fxsave, 0, -1); @@ -644,7 +644,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, &fpu->state.fsave, 0, -1); - fpstate_sanitize_xstate(target); + fpstate_sanitize_xstate(fpu); if (kbuf && pos == 0 && count == sizeof(env)) { convert_from_fxsr(kbuf, target); @@ -665,8 +665,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, int ret; fpu__activate_stopped(fpu); - - fpstate_sanitize_xstate(target); + fpstate_sanitize_xstate(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 47b9591947e1..a8ce38a9d70b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -92,16 +92,16 @@ EXPORT_SYMBOL_GPL(cpu_has_xfeatures); * if the corresponding header bit is zero. This is to ensure that user-space doesn't * see some stale state in the memory layout during signal handling, debugging etc. */ -void fpstate_sanitize_xstate(struct task_struct *tsk) +void fpstate_sanitize_xstate(struct fpu *fpu) { - struct i387_fxsave_struct *fx = &tsk->thread.fpu.state.fxsave; + struct i387_fxsave_struct *fx = &fpu->state.fxsave; int feature_bit; u64 xfeatures; if (!use_xsaveopt()) return; - xfeatures = tsk->thread.fpu.state.xsave.header.xfeatures; + xfeatures = fpu->state.xsave.header.xfeatures; /* * None of the feature bits are in init state. So nothing else @@ -318,7 +318,7 @@ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (ia32_fxstate) fpu_fxsave(&tsk->thread.fpu); } else { - fpstate_sanitize_xstate(tsk); + fpstate_sanitize_xstate(&tsk->thread.fpu); if (__copy_to_user(buf_fx, xsave, xstate_size)) return -1; } -- cgit v1.2.1 From c8e1404120d55876d2dedd3a541bc484ef692c58 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 11:35:20 +0200 Subject: x86/fpu: Rename save_xstate_sig() to copy_fpstate_to_sigframe() Standardize the naming of save_xstate_sig() by renaming it to copy_fpstate_to_sigframe(): this tells us at a glance that the function copies an FPU fpstate to a signal frame. This naming also follows the naming of copy_fpregs_to_fpstate(). Don't put 'xstate' into the name: since this is a generic name, it's expected that the function is able to handle xstate frames as well, beyond legacy frames. xstate used to be the odd case in the x86 FPU code - now it's the common case. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/xstate.c | 2 +- arch/x86/kernel/signal.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index d6d8f4ca5136..2e0b1b7842ae 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -327,7 +327,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size); *fpstate = (struct _fpstate_ia32 __user *) sp; - if (save_xstate_sig(*fpstate, (void __user *)fx_aligned, + if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, math_size) < 0) return (void __user *) -1L; } diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index da96b0cbfcb3..58c274dfcb62 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -523,7 +523,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switc /* * Signal frame handlers... */ -extern int save_xstate_sig(void __user *buf, void __user *fx, int size); +extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fx, int size); extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size); static inline int xstate_sigframe_size(void) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a8ce38a9d70b..5178158a03ff 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -293,7 +293,7 @@ static inline int save_user_xstate(struct xsave_struct __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ -int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) +int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct xsave_struct *xsave = ¤t->thread.fpu.state.xsave; struct task_struct *tsk = current; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index c67f96c87938..59cfc9c97491 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -235,7 +235,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, /* save i387 and extended state */ if (fpu->fpstate_active && - save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0) + copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; return (void __user *)sp; -- cgit v1.2.1 From 2a52af8b8adc498b3cccbc219ec997cad6cafb9c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 12:09:27 +0200 Subject: x86/fpu: Rename save_user_xstate() to copy_fpregs_to_sigframe() Move the naming in line with existing names, so that we now have: copy_fpregs_to_fpstate() copy_fpstate_to_sigframe() copy_fpregs_to_sigframe() ... where each function does what its name suggests. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 5178158a03ff..28638820ed0e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -257,7 +257,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) return err; } -static inline int save_user_xstate(struct xsave_struct __user *buf) +static inline int copy_fpregs_to_sigframe(struct xsave_struct __user *buf) { int err; @@ -312,7 +312,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (user_has_fpu()) { /* Save the live register state to the user directly. */ - if (save_user_xstate(buf_fx)) + if (copy_fpregs_to_sigframe(buf_fx)) return -1; /* Update the thread's fxstate to save the fsave header. */ if (ia32_fxstate) -- cgit v1.2.1 From be7436d519970365775d3d2d7b73e75c233e1994 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 12:53:45 +0200 Subject: x86/fpu: Clarify ancient comments in fpu__restore() So this function still had ancient language about 'saving current math information' - but we haven't been doing lazy FPU saves for quite some time, we are doing lazy FPU restores. Also remove IRQ13 related comment, which we don't support anymore either. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1d6e97c59a72..91b9935021c4 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -319,14 +319,14 @@ static void fpu__activate_stopped(struct fpu *child_fpu) } /* - * 'fpu__restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task + * 'fpu__restore()' is called to copy FPU registers from + * the FPU fpstate to the live hw registers and to activate + * access to the hardware registers, so that FPU instructions + * can be used afterwards. * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - * - * Must be called with kernel preemption disabled (eg with local - * local interrupts as in the case of do_device_not_available). + * Must be called with kernel preemption disabled (for example + * with local interrupts disabled, as it is in the case of + * do_device_not_available()). */ void fpu__restore(void) { -- cgit v1.2.1 From 3c6dffa93be9f82f2566dcc948285d6f79fb9ce2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Apr 2015 12:28:08 +0200 Subject: x86/fpu: Rename user_has_fpu() to fpregs_active() Rename this function in line with the new FPU nomenclature. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/kernel/fpu/xstate.c | 2 +- arch/x86/kvm/vmx.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 58c274dfcb62..0f17cd4e4e58 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -358,7 +358,7 @@ static inline void __fpregs_activate(struct fpu *fpu) * to save the FP state - we'll just take a #NM * fault and get the FPU access back. */ -static inline int user_has_fpu(void) +static inline int fpregs_active(void) { return current->thread.fpu.fpregs_active; } @@ -557,7 +557,7 @@ static inline void user_fpu_begin(void) struct fpu *fpu = ¤t->thread.fpu; preempt_disable(); - if (!user_has_fpu()) + if (!fpregs_active()) fpregs_activate(fpu); preempt_enable(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 28638820ed0e..b8e5fee2aef3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -310,7 +310,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_ia32 __user *) buf) ? -1 : 1; - if (user_has_fpu()) { + if (fpregs_active()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2de55e953842..1c384bf856e5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1882,7 +1882,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. */ - if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded) + if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded) stts(); load_gdt(this_cpu_ptr(&host_gdt)); } -- cgit v1.2.1 From b1276c48e91bee869454301d3678cc49d8f57ab4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2015 10:58:03 +0200 Subject: x86/fpu: Initialize fpregs in fpu__init_cpu_generic() FPU fpregs do not get initialized during bootup on secondary CPUs, on non-xsave capable CPUs. For example on one of my systems, the secondary CPU has this FPU state on bootup: x86: Booting SMP configuration: .... node #0, CPUs: #1 x86/fpu ###################### x86/fpu # FPU register dump on CPU#1: x86/fpu # ... CWD: ffff0040 x86/fpu # ... SWD: ffff0000 x86/fpu # ... TWD: ffff555a x86/fpu # ... FIP: 00000000 x86/fpu # ... FCS: 00000000 x86/fpu # ... FOO: 00000000 x86/fpu # ... FOS: ffff0000 x86/fpu # ... FP0: 02 57 00 00 00 00 00 00 ff ff x86/fpu # ... FP1: 1b e2 00 00 00 00 00 00 ff ff x86/fpu # ... FP2: 00 00 00 00 00 00 00 00 00 00 x86/fpu # ... FP3: 00 00 00 00 00 00 00 00 00 00 x86/fpu # ... FP4: 00 00 00 00 00 00 00 00 00 00 x86/fpu # ... FP5: 00 00 00 00 00 00 00 00 00 00 x86/fpu # ... FP6: 00 00 00 00 00 00 00 00 00 00 x86/fpu # ... FP7: 00 00 00 00 00 00 00 00 00 00 x86/fpu # ... SW: dadadada x86/fpu ###################### Note how CWD and TWD are off their usual init state (0x037f and 0xffff), and how FP0 and FP1 has non-zero content. This is normally not a problem, because any user-space FPU state is initalized properly - but it can complicate the use of FPU instructions in kernel code via kernel_fpu_begin()/end(): if the FPU using code does not initialize registers itself, it might generate spurious exceptions depending on which CPU it executes on. Fix this by initializing the x87 state via the FNINIT instruction. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 460e7e2c6186..72219ce2385a 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -36,6 +36,9 @@ static void fpu__init_cpu_generic(void) if (!cpu_has_fpu) cr0 |= X86_CR0_EM; write_cr0(cr0); + + /* Flush out any pending x87 state: */ + asm volatile ("fninit"); } /* -- cgit v1.2.1 From 2e85591a6ca2ad84741d2859753be5497d74bd42 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2015 08:46:26 +0200 Subject: x86/fpu: Better document fpu__clear() state handling So prior to this fix: c88d47480d30 ("x86/fpu: Always restore_xinit_state() when use_eager_cpu()") we leaked FPU state across execve() boundaries on eagerfpu systems: $ /host/home/mingo/dump-xmm-regs-exec # XMM state before execve(): XMM0 : 000000000000dede XMM1 : 000000000000dedf XMM2 : 000000000000dee0 XMM3 : 000000000000dee1 XMM4 : 000000000000dee2 XMM5 : 000000000000dee3 XMM6 : 000000000000dee4 XMM7 : 000000000000dee5 XMM8 : 000000000000dee6 XMM9 : 000000000000dee7 XMM10: 000000000000dee8 XMM11: 000000000000dee9 XMM12: 000000000000deea XMM13: 000000000000deeb XMM14: 000000000000deec XMM15: 000000000000deed # XMM state after execve(), in the new task context: XMM0 : 0000000000000000 XMM1 : 2f2f2f2f2f2f2f2f XMM2 : 0000000000000000 XMM3 : 0000000000000000 XMM4 : 00000000000000ff XMM5 : 00000000ff000000 XMM6 : 000000000000dee4 XMM7 : 000000000000dee5 XMM8 : 0000000000000000 XMM9 : 0000000000000000 XMM10: 0000000000000000 XMM11: 0000000000000000 XMM12: 0000000000000000 XMM13: 000000000000deeb XMM14: 000000000000deec XMM15: 000000000000deed Better explain what this function is supposed to do and why. Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 91b9935021c4..a2e2da2b08c5 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -348,6 +348,10 @@ void fpu__restore(void) } EXPORT_SYMBOL_GPL(fpu__restore); +/* + * Called by sys_execve() to clear the FPU fpregs, so that FPU state + * of the previous binary does not leak over into the exec()ed binary: + */ void fpu__clear(struct task_struct *tsk) { struct fpu *fpu = &tsk->thread.fpu; -- cgit v1.2.1 From 5e907bb0459399b0d1cc8d4c7e9f363a995b748a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 09:09:26 +0200 Subject: x86/alternatives, x86/fpu: Add 'alternatives_patched' debug flag and use it in xsave_state() We'd like to use xsave_state() earlier, but its SYSTEM_BOOTING check is too imprecise. The real condition that xsave_state() would like to check is whether alternative XSAVE instructions were patched into the kernel image already. Add such a (read-mostly) debug flag and use it in xsave_state(). Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 6 ++++++ arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/alternative.c | 5 +++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index ba32af062f61..7bfc85bbb8ff 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -52,6 +52,12 @@ struct alt_instr { u8 padlen; /* length of build-time padding */ } __packed; +/* + * Debug flag that can be tested to see whether alternative + * instructions were patched in already: + */ +extern int alternatives_patched; + extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 31a002ad5aeb..ab2c507b58b6 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -119,7 +119,7 @@ static inline int xsave_state(struct xsave_struct *fx) u32 hmask = mask >> 32; int err = 0; - WARN_ON(system_state == SYSTEM_BOOTING); + WARN_ON(!alternatives_patched); /* * If xsaves is enabled, xsaves replaces xsaveopt because diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index aef653193160..7fe097235376 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -21,6 +21,10 @@ #include #include +int __read_mostly alternatives_patched; + +EXPORT_SYMBOL_GPL(alternatives_patched); + #define MAX_PATCH_LEN (255-1) static int __initdata_or_module debug_alternative; @@ -627,6 +631,7 @@ void __init alternative_instructions(void) apply_paravirt(__parainstructions, __parainstructions_end); restart_nmi(); + alternatives_patched = 1; } /** -- cgit v1.2.1 From 5033861575df08a04090cc7b785b2b7aadcbde82 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2015 19:04:31 +0200 Subject: x86/fpu: Synchronize the naming of drop_fpu() and fpu_reset_state() drop_fpu() and fpu_reset_state() are similar in functionality and in scope, yet this is not apparent from their names. drop_fpu() deactivates FPU contents (both the fpregs and the fpstate), but leaves register contents intact in the eager-FPU case, mostly as an optimization. It disables fpregs in the lazy FPU case. The drop_fpu() method can be used to destroy FPU state in an optimized way, when we know that a new state will be loaded before user-space might see any remains of the old FPU state: - such as in sys_exit()'s exit_thread() where we know this task won't execute any user-space instructions anymore and the next context switch cleans up the FPU. The old FPU state might still be around in the eagerfpu case but won't be saved. - in __restore_xstate_sig(), where we use drop_fpu() before copying a new state into the fpstate and activating that one. No user-pace instructions can execute between those steps. - in sys_execve()'s fpu__clear(): there we use drop_fpu() in the !eagerfpu case, where it's equivalent to a full reinit. fpu_reset_state() is a stronger version of drop_fpu(): both in the eagerfpu and the lazy-FPU case it guarantees that fpregs are reinitialized to init state. This method is used in cases where we need a full reset: - handle_signal() uses fpu_reset_state() to reset the FPU state to init before executing a user-space signal handler. While we have already saved the original FPU state at this point, and always restore the original state, the signal handling code still has to do this reinit, because signals may interrupt any user-space instruction, and the FPU might be in various intermediate states (such as an unbalanced x87 stack) that is not immediately usable for general C signal handler code. - __restore_xstate_sig() uses fpu_reset_state() when the signal frame has no FP context. Since the signal handler may have modified the FPU state, it gets reset back to init state. - in another branch __restore_xstate_sig() uses fpu_reset_state() to handle a restoration error: when restore_user_xstate() fails to restore FPU state and we might have inconsistent FPU data, fpu_reset_state() is used to reset it back to a known good state. - __kernel_fpu_end() uses fpu_reset_state() in an error branch. This is in a 'must not trigger' error branch, so on bug-free kernels this never triggers. - fpu__restore() uses fpu_reset_state() in an error path as well: if the fpstate was set up with invalid FPU state (via ptrace or via a signal handler), then it's reset back to init state. - likewise, the scheduler's switch_fpu_finish() uses it in a restoration error path too. Move both drop_fpu() and fpu_reset_state() to the fpu__*() namespace and harmonize their naming with their function: fpu__drop() fpu__reset() This clearly shows that both methods operate on the full state of the FPU, just like fpu__restore(). Also add comments to explain what each function does. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 23 ++++++++++++++--------- arch/x86/kernel/fpu/core.c | 6 +++--- arch/x86/kernel/fpu/xstate.c | 6 +++--- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/signal.c | 2 +- 5 files changed, 22 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0f17cd4e4e58..31bfda818f30 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -382,11 +382,17 @@ static inline void fpregs_deactivate(struct fpu *fpu) __fpregs_deactivate_hw(); } -static inline void drop_fpu(struct fpu *fpu) +/* + * Drops current FPU state: deactivates the fpregs and + * the fpstate. NOTE: it still leaves previous contents + * in the fpregs in the eager-FPU case. + * + * This function can be used in cases where we know that + * a state-restore is coming: either an explicit one, + * or a reschedule. + */ +static inline void fpu__drop(struct fpu *fpu) { - /* - * Forget coprocessor state.. - */ preempt_disable(); fpu->counter = 0; @@ -412,13 +418,12 @@ static inline void restore_init_xstate(void) } /* - * Reset the FPU state in the eager case and drop it in the lazy case (later use - * will reinit it). + * Reset the FPU state back to init state. */ -static inline void fpu_reset_state(struct fpu *fpu) +static inline void fpu__reset(struct fpu *fpu) { if (!use_eager_fpu()) - drop_fpu(fpu); + fpu__drop(fpu); else restore_init_xstate(); } @@ -516,7 +521,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switc { if (fpu_switch.preload) { if (unlikely(restore_fpu_checking(new_fpu))) - fpu_reset_state(new_fpu); + fpu__reset(new_fpu); } } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a2e2da2b08c5..bf217cde114d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -116,7 +116,7 @@ void __kernel_fpu_end(void) if (fpu->fpregs_active) { if (WARN_ON(restore_fpu_checking(fpu))) - fpu_reset_state(fpu); + fpu__reset(fpu); } else { __fpregs_deactivate_hw(); } @@ -339,7 +339,7 @@ void fpu__restore(void) kernel_fpu_disable(); fpregs_activate(fpu); if (unlikely(restore_fpu_checking(fpu))) { - fpu_reset_state(fpu); + fpu__reset(fpu); force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); } else { tsk->thread.fpu.counter++; @@ -360,7 +360,7 @@ void fpu__clear(struct task_struct *tsk) if (!use_eager_fpu()) { /* FPU state will be reallocated lazily at the first use. */ - drop_fpu(fpu); + fpu__drop(fpu); } else { if (!fpu->fpstate_active) { fpu__activate_curr(fpu); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b8e5fee2aef3..5e3d9242bb95 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -401,7 +401,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) config_enabled(CONFIG_IA32_EMULATION)); if (!buf) { - fpu_reset_state(fpu); + fpu__reset(fpu); return 0; } @@ -449,7 +449,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) * We will be ready to restore/save the state only after * fpu->fpstate_active is again set. */ - drop_fpu(fpu); + fpu__drop(fpu); if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || __copy_from_user(&env, buf, sizeof(env))) { @@ -474,7 +474,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) */ user_fpu_begin(); if (restore_user_xstate(buf_fx, xfeatures, fx_only)) { - fpu_reset_state(fpu); + fpu__reset(fpu); return -1; } } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5d37c26fa89f..dde263fb2031 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -110,7 +110,7 @@ void exit_thread(void) kfree(bp); } - drop_fpu(fpu); + fpu__drop(fpu); } void flush_thread(void) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 59cfc9c97491..6bf512390536 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -667,7 +667,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * Ensure the signal handler starts with the new fpu state. */ if (fpu->fpstate_active) - fpu_reset_state(fpu); + fpu__reset(fpu); } signal_setup_done(failed, ksig, stepping); } -- cgit v1.2.1 From 0e75c54f1703e83e6cdf239491bf7294f6c34777 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2015 20:10:43 +0200 Subject: x86/fpu: Rename restore_fpu_checking() to copy_fpstate_to_fpregs() fpu_restore_checking() is a helper function of restore_fpu_checking(), but this is not apparent from the naming. Both copy fpstate contents to fpregs, while the fuller variant does a full copy without leaking information. So rename them to: copy_fpstate_to_fpregs() __copy_fpstate_to_fpregs() Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 8 ++++---- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 31bfda818f30..c09aea145e09 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -289,7 +289,7 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) extern void fpu__save(struct fpu *fpu); -static inline int fpu_restore_checking(struct fpu *fpu) +static inline int __copy_fpstate_to_fpregs(struct fpu *fpu) { if (use_xsave()) return fpu_xrstor_checking(&fpu->state.xsave); @@ -299,7 +299,7 @@ static inline int fpu_restore_checking(struct fpu *fpu) return frstor_checking(&fpu->state.fsave); } -static inline int restore_fpu_checking(struct fpu *fpu) +static inline int copy_fpstate_to_fpregs(struct fpu *fpu) { /* * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is @@ -314,7 +314,7 @@ static inline int restore_fpu_checking(struct fpu *fpu) : : [addr] "m" (fpu->fpregs_active)); } - return fpu_restore_checking(fpu); + return __copy_fpstate_to_fpregs(fpu); } /* @@ -520,7 +520,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) { if (fpu_switch.preload) { - if (unlikely(restore_fpu_checking(new_fpu))) + if (unlikely(copy_fpstate_to_fpregs(new_fpu))) fpu__reset(new_fpu); } } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index bf217cde114d..14d8e33d9fe0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -115,7 +115,7 @@ void __kernel_fpu_end(void) struct fpu *fpu = ¤t->thread.fpu; if (fpu->fpregs_active) { - if (WARN_ON(restore_fpu_checking(fpu))) + if (WARN_ON(copy_fpstate_to_fpregs(fpu))) fpu__reset(fpu); } else { __fpregs_deactivate_hw(); @@ -338,7 +338,7 @@ void fpu__restore(void) /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); fpregs_activate(fpu); - if (unlikely(restore_fpu_checking(fpu))) { + if (unlikely(copy_fpstate_to_fpregs(fpu))) { fpu__reset(fpu); force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); } else { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5c61aae277f9..f4438179398b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7030,7 +7030,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; __kernel_fpu_begin(); - fpu_restore_checking(&vcpu->arch.guest_fpu); + __copy_fpstate_to_fpregs(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } -- cgit v1.2.1 From 6ffc152e4606c7f9149940d36a83e786f7f0a4f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2015 20:24:14 +0200 Subject: x86/fpu: Move all the fpu__*() high level methods closer to each other The fpu__*() methods are closely related, but they are defined in scattered places within the FPU code. Concentrate them, and also uninline fpu__save(), fpu__drop() and fpu__reset() to save about 5K of kernel text on 64-bit kernels: text data bss dec filename 14113063 2575280 1634304 18322647 vmlinux.before 14108070 2575280 1634304 18317654 vmlinux.after Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 53 +++++++------------------------------ arch/x86/kernel/fpu/core.c | 38 ++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index c09aea145e09..f20a0030f6a1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -46,10 +46,19 @@ extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__activate_curr(struct fpu *fpu); extern void fpstate_init(struct fpu *fpu); -extern void fpu__clear(struct task_struct *tsk); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); + +/* + * High level FPU state handling functions: + */ +extern void fpu__save(struct fpu *fpu); extern void fpu__restore(void); +extern void fpu__drop(struct fpu *fpu); +extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); +extern void fpu__reset(struct fpu *fpu); +extern void fpu__clear(struct task_struct *tsk); + extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); @@ -287,8 +296,6 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) return 0; } -extern void fpu__save(struct fpu *fpu); - static inline int __copy_fpstate_to_fpregs(struct fpu *fpu) { if (use_xsave()) @@ -382,33 +389,6 @@ static inline void fpregs_deactivate(struct fpu *fpu) __fpregs_deactivate_hw(); } -/* - * Drops current FPU state: deactivates the fpregs and - * the fpstate. NOTE: it still leaves previous contents - * in the fpregs in the eager-FPU case. - * - * This function can be used in cases where we know that - * a state-restore is coming: either an explicit one, - * or a reschedule. - */ -static inline void fpu__drop(struct fpu *fpu) -{ - preempt_disable(); - fpu->counter = 0; - - if (fpu->fpregs_active) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - fpregs_deactivate(fpu); - } - - fpu->fpstate_active = 0; - - preempt_enable(); -} - static inline void restore_init_xstate(void) { if (use_xsave()) @@ -417,17 +397,6 @@ static inline void restore_init_xstate(void) fxrstor_checking(&init_xstate_ctx.i387); } -/* - * Reset the FPU state back to init state. - */ -static inline void fpu__reset(struct fpu *fpu) -{ - if (!use_eager_fpu()) - fpu__drop(fpu); - else - restore_init_xstate(); -} - /* * Definitions for the eXtended Control Register instructions */ @@ -597,8 +566,6 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) } } -extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); - static inline unsigned long alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 14d8e33d9fe0..acca83be23f0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -348,6 +348,44 @@ void fpu__restore(void) } EXPORT_SYMBOL_GPL(fpu__restore); +/* + * Drops current FPU state: deactivates the fpregs and + * the fpstate. NOTE: it still leaves previous contents + * in the fpregs in the eager-FPU case. + * + * This function can be used in cases where we know that + * a state-restore is coming: either an explicit one, + * or a reschedule. + */ +void fpu__drop(struct fpu *fpu) +{ + preempt_disable(); + fpu->counter = 0; + + if (fpu->fpregs_active) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + fpregs_deactivate(fpu); + } + + fpu->fpstate_active = 0; + + preempt_enable(); +} + +/* + * Reset the FPU state back to init state: + */ +void fpu__reset(struct fpu *fpu) +{ + if (!use_eager_fpu()) + fpu__drop(fpu); + else + restore_init_xstate(); +} + /* * Called by sys_execve() to clear the FPU fpregs, so that FPU state * of the previous binary does not leak over into the exec()ed binary: -- cgit v1.2.1 From 04c8e01d50ae1f2b33a46459e8b1e776b747e97d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2015 20:35:33 +0200 Subject: x86/fpu: Move fpu__clear() to 'struct fpu *' parameter passing Do it like all other high level FPU state handling functions: they only know about struct fpu, not about the task. (Also remove a dead prototype while at it.) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 5 ++--- arch/x86/kernel/fpu/core.c | 6 ++---- arch/x86/kernel/process.c | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index f20a0030f6a1..d6ac4611f05e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -37,9 +37,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, #define MXCSR_DEFAULT 0x1f80 extern unsigned int mxcsr_feature_mask; -extern void fpu__init_cpu(void); -extern void eager_fpu_init(void); +extern void fpu__init_cpu(void); extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); @@ -57,7 +56,7 @@ extern void fpu__restore(void); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); extern void fpu__reset(struct fpu *fpu); -extern void fpu__clear(struct task_struct *tsk); +extern void fpu__clear(struct fpu *fpu); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index acca83be23f0..0ccdd8348872 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -390,11 +390,9 @@ void fpu__reset(struct fpu *fpu) * Called by sys_execve() to clear the FPU fpregs, so that FPU state * of the previous binary does not leak over into the exec()ed binary: */ -void fpu__clear(struct task_struct *tsk) +void fpu__clear(struct fpu *fpu) { - struct fpu *fpu = &tsk->thread.fpu; - - WARN_ON_ONCE(tsk != current); /* Almost certainly an anomaly */ + WARN_ON_ONCE(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ if (!use_eager_fpu()) { /* FPU state will be reallocated lazily at the first use. */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index dde263fb2031..b53f6c939bc2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -120,7 +120,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - fpu__clear(tsk); + fpu__clear(&tsk->thread.fpu); } static void hard_disable_TSC(void) -- cgit v1.2.1 From 9dfe99b755a263f9f6ac1dfdb5512bd6e22c28e8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2015 20:55:19 +0200 Subject: x86/fpu: Rename restore_xstate_sig() to fpu__restore_sig() restore_xstate_sig() is a misnomer: it's not limited to 'xstate' at all, it is the high level 'restore FPU state from a signal frame' function that works with all legacy FPU formats as well. Rename it (and its helper) accordingly, and also move it to the fpu__*() namespace. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu/internal.h | 6 +++--- arch/x86/kernel/fpu/xstate.c | 2 +- arch/x86/kernel/signal.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 2e0b1b7842ae..de81ddf3b0b2 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -197,7 +197,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, buf = compat_ptr(tmp); } get_user_catch(err); - err |= restore_xstate_sig(buf, 1); + err |= fpu__restore_sig(buf, 1); force_iret(); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d6ac4611f05e..d6cfbdafbab2 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -497,14 +497,14 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switc * Signal frame handlers... */ extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fx, int size); -extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size); +extern int __fpu__restore_sig(void __user *buf, void __user *fx, int size); static inline int xstate_sigframe_size(void) { return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; } -static inline int restore_xstate_sig(void __user *buf, int ia32_frame) +static inline int fpu__restore_sig(void __user *buf, int ia32_frame) { void __user *buf_fx = buf; int size = xstate_sigframe_size(); @@ -514,7 +514,7 @@ static inline int restore_xstate_sig(void __user *buf, int ia32_frame) size += sizeof(struct i387_fsave_struct); } - return __restore_xstate_sig(buf, buf_fx, size); + return __fpu__restore_sig(buf, buf_fx, size); } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 5e3d9242bb95..ea514d6a34e8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -388,7 +388,7 @@ static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) return frstor_user(buf); } -int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) +int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) { int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 6bf512390536..7416fa86f3c7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -102,7 +102,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) get_user_ex(buf, &sc->fpstate); } get_user_catch(err); - err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); + err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32)); force_iret(); -- cgit v1.2.1 From 82c0e45eb5c839a8cee3d8d8a82d0592c4f42773 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2015 21:09:18 +0200 Subject: x86/fpu: Move the signal frame handling code closer to each other Consolidate more signal frame related functions: text data bss dec filename 14108070 2575280 1634304 18317654 vmlinux.before 14107944 2575344 1634304 18317592 vmlinux.after Also, while moving it, rename alloc_mathframe() to fpu__alloc_mathframe(). Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu/internal.h | 38 ++++-------------------------------- arch/x86/kernel/fpu/xstate.c | 39 ++++++++++++++++++++++++++++++++++++- arch/x86/kernel/signal.c | 4 ++-- 4 files changed, 45 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index de81ddf3b0b2..54605eb1631f 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -325,7 +325,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, if (fpu->fpstate_active) { unsigned long fx_aligned, math_size; - sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size); + sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); *fpstate = (struct _fpstate_ia32 __user *) sp; if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, math_size) < 0) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d6cfbdafbab2..34fbf95bbe14 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -53,6 +53,7 @@ extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); */ extern void fpu__save(struct fpu *fpu); extern void fpu__restore(void); +extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); extern void fpu__reset(struct fpu *fpu); @@ -497,25 +498,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switc * Signal frame handlers... */ extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fx, int size); -extern int __fpu__restore_sig(void __user *buf, void __user *fx, int size); - -static inline int xstate_sigframe_size(void) -{ - return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; -} - -static inline int fpu__restore_sig(void __user *buf, int ia32_frame) -{ - void __user *buf_fx = buf; - int size = xstate_sigframe_size(); - - if (ia32_frame && use_fxsr()) { - buf_fx = buf + sizeof(struct i387_fsave_struct); - size += sizeof(struct i387_fsave_struct); - } - - return __fpu__restore_sig(buf, buf_fx, size); -} /* * Needs to be preemption-safe. @@ -565,20 +547,8 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) } } -static inline unsigned long -alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, - unsigned long *size) -{ - unsigned long frame_size = xstate_sigframe_size(); - - *buf_fx = sp = round_down(sp - frame_size, 64); - if (ia32_frame && use_fxsr()) { - frame_size += sizeof(struct i387_fsave_struct); - sp -= sizeof(struct i387_fsave_struct); - } - - *size = frame_size; - return sp; -} +unsigned long +fpu__alloc_mathframe(unsigned long sp, int ia32_frame, + unsigned long *buf_fx, unsigned long *size); #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index ea514d6a34e8..810f080fadf3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -388,7 +388,7 @@ static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) return frstor_user(buf); } -int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) +static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) { int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; @@ -482,6 +482,43 @@ int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) return 0; } +static inline int xstate_sigframe_size(void) +{ + return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; +} + +/* + * Restore FPU state from a sigframe: + */ +int fpu__restore_sig(void __user *buf, int ia32_frame) +{ + void __user *buf_fx = buf; + int size = xstate_sigframe_size(); + + if (ia32_frame && use_fxsr()) { + buf_fx = buf + sizeof(struct i387_fsave_struct); + size += sizeof(struct i387_fsave_struct); + } + + return __fpu__restore_sig(buf, buf_fx, size); +} + +unsigned long +fpu__alloc_mathframe(unsigned long sp, int ia32_frame, + unsigned long *buf_fx, unsigned long *size) +{ + unsigned long frame_size = xstate_sigframe_size(); + + *buf_fx = sp = round_down(sp - frame_size, 64); + if (ia32_frame && use_fxsr()) { + frame_size += sizeof(struct i387_fsave_struct); + sp -= sizeof(struct i387_fsave_struct); + } + + *size = frame_size; + + return sp; +} /* * Prepare the SW reserved portion of the fxsave memory layout, indicating * the presence of the extended state information in the memory layout diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 7416fa86f3c7..9554ca69a84e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -219,8 +219,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } if (fpu->fpstate_active) { - sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32), - &buf_fx, &math_size); + sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32), + &buf_fx, &math_size); *fpstate = (void __user *)sp; } -- cgit v1.2.1 From fbce7782467553d09cfde39473d23bde4ad78270 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 07:12:46 +0200 Subject: x86/fpu: Merge fpu__reset() and fpu__clear() With recent cleanups and fixes the fpu__reset() and fpu__clear() functions have become almost identical in functionality: the only difference is that fpu__reset() assumed that the fpstate was already active in the eagerfpu case, while fpu__clear() activated it if it was inactive. This distinction almost never matters, the only case where such fpstate activation happens if if the init thread (PID 1) gets exec()-ed for the first time. So keep fpu__clear() and change all fpu__reset() uses to fpu__clear() to simpify the logic. ( In a later patch we'll further simplify fpu__clear() by making sure that all contexts it is called on are already active. ) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 3 +-- arch/x86/kernel/fpu/core.c | 21 ++++++--------------- arch/x86/kernel/fpu/xstate.c | 4 ++-- arch/x86/kernel/signal.c | 2 +- 4 files changed, 10 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 34fbf95bbe14..a55d63efab0f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -56,7 +56,6 @@ extern void fpu__restore(void); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); -extern void fpu__reset(struct fpu *fpu); extern void fpu__clear(struct fpu *fpu); extern void fpu__init_check_bugs(void); @@ -490,7 +489,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switc { if (fpu_switch.preload) { if (unlikely(copy_fpstate_to_fpregs(new_fpu))) - fpu__reset(new_fpu); + fpu__clear(new_fpu); } } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 0ccdd8348872..1ba866cce00a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -116,7 +116,7 @@ void __kernel_fpu_end(void) if (fpu->fpregs_active) { if (WARN_ON(copy_fpstate_to_fpregs(fpu))) - fpu__reset(fpu); + fpu__clear(fpu); } else { __fpregs_deactivate_hw(); } @@ -339,7 +339,7 @@ void fpu__restore(void) kernel_fpu_disable(); fpregs_activate(fpu); if (unlikely(copy_fpstate_to_fpregs(fpu))) { - fpu__reset(fpu); + fpu__clear(fpu); force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); } else { tsk->thread.fpu.counter++; @@ -376,19 +376,10 @@ void fpu__drop(struct fpu *fpu) } /* - * Reset the FPU state back to init state: - */ -void fpu__reset(struct fpu *fpu) -{ - if (!use_eager_fpu()) - fpu__drop(fpu); - else - restore_init_xstate(); -} - -/* - * Called by sys_execve() to clear the FPU fpregs, so that FPU state - * of the previous binary does not leak over into the exec()ed binary: + * Clear the FPU state back to init state. + * + * Called by sys_execve(), by the signal handler code and by various + * error paths. */ void fpu__clear(struct fpu *fpu) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 810f080fadf3..9bc3734acc16 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -401,7 +401,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) config_enabled(CONFIG_IA32_EMULATION)); if (!buf) { - fpu__reset(fpu); + fpu__clear(fpu); return 0; } @@ -474,7 +474,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ user_fpu_begin(); if (restore_user_xstate(buf_fx, xfeatures, fx_only)) { - fpu__reset(fpu); + fpu__clear(fpu); return -1; } } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 9554ca69a84e..7c08795073d2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -667,7 +667,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * Ensure the signal handler starts with the new fpu state. */ if (fpu->fpstate_active) - fpu__reset(fpu); + fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); } -- cgit v1.2.1 From 05012c13f69d67be8a6a7e65726eeb70899ad6ad Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 07:26:04 +0200 Subject: x86/fpu: Move is_ia32*frame() helpers out of fpu/internal.h Move them to their only user. This makes the code easier to read, the header is less cluttered, and it also speeds up the build a bit. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 16 ---------------- arch/x86/kernel/signal.c | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a55d63efab0f..dc4842b0831b 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -103,22 +103,6 @@ static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } -static inline int is_ia32_compat_frame(void) -{ - return config_enabled(CONFIG_IA32_EMULATION) && - test_thread_flag(TIF_IA32); -} - -static inline int is_ia32_frame(void) -{ - return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); -} - -static inline int is_x32_frame(void) -{ - return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); -} - #define X87_FSW_ES (1 << 7) /* Exception Summary */ static __always_inline __pure bool use_eager_fpu(void) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 7c08795073d2..f4b205686527 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -593,6 +593,22 @@ badframe: return 0; } +static inline int is_ia32_compat_frame(void) +{ + return config_enabled(CONFIG_IA32_EMULATION) && + test_thread_flag(TIF_IA32); +} + +static inline int is_ia32_frame(void) +{ + return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); +} + +static inline int is_x32_frame(void) +{ + return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); +} + static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { -- cgit v1.2.1 From fcbc99c403c4a1a24ac4744e08c04da3ec18a68c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 08:45:02 +0200 Subject: x86/fpu: Split out fpu/signal.h from fpu/internal.h for signal frame handling functions Most of the FPU does not use them, so split it out and include them in signal.c and ia32_signal.c Also fix header file dependency assumption in fpu/core.c. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 1 + arch/x86/include/asm/fpu/internal.h | 24 ------------------------ arch/x86/include/asm/fpu/signal.h | 31 +++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 2 ++ arch/x86/kernel/fpu/xstate.c | 1 + arch/x86/kernel/ptrace.c | 1 + arch/x86/kernel/signal.c | 1 + 7 files changed, 37 insertions(+), 24 deletions(-) create mode 100644 arch/x86/include/asm/fpu/signal.h (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 54605eb1631f..ae3a29ae875b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index dc4842b0831b..e2ceb49d310d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -19,21 +19,6 @@ #include #include -#ifdef CONFIG_X86_64 -# include -# include -struct ksignal; -int ia32_setup_rt_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -#else -# define user_i387_ia32_struct user_i387_struct -# define user32_fxsr_struct user_fxsr_struct -# define ia32_setup_frame __setup_frame -# define ia32_setup_rt_frame __setup_rt_frame -#endif - #define MXCSR_DEFAULT 0x1f80 extern unsigned int mxcsr_feature_mask; @@ -63,11 +48,6 @@ extern void fpu__resume_cpu(void); DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -extern void convert_from_fxsr(struct user_i387_ia32_struct *env, - struct task_struct *tsk); -extern void convert_to_fxsr(struct task_struct *tsk, - const struct user_i387_ia32_struct *env); - extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, xstateregs_get; @@ -530,8 +510,4 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) } } -unsigned long -fpu__alloc_mathframe(unsigned long sp, int ia32_frame, - unsigned long *buf_fx, unsigned long *size); - #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h new file mode 100644 index 000000000000..0803dc2aba80 --- /dev/null +++ b/arch/x86/include/asm/fpu/signal.h @@ -0,0 +1,31 @@ +/* + * x86 FPU signal frame handling methods: + */ +#ifndef _ASM_X86_FPU_SIGNAL_H +#define _ASM_X86_FPU_SIGNAL_H + +#ifdef CONFIG_X86_64 +# include +# include +struct ksignal; +int ia32_setup_rt_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs); +#else +# define user_i387_ia32_struct user_i387_struct +# define user32_fxsr_struct user_fxsr_struct +# define ia32_setup_frame __setup_frame +# define ia32_setup_rt_frame __setup_rt_frame +#endif + +extern void convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk); +extern void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env); + +unsigned long +fpu__alloc_mathframe(unsigned long sp, int ia32_frame, + unsigned long *buf_fx, unsigned long *size); + +#endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1ba866cce00a..b61a5136c0e0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -6,6 +6,8 @@ * Gareth Hughes , May 2000 */ #include +#include + #include /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9bc3734acc16..78710740e9a0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 4c615661ec72..51e73a685ce4 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index f4b205686527..206996c1669d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.1 From 59a36d16be8f9f68410f1bd396577fb7f31ae877 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 08:53:18 +0200 Subject: x86/fpu: Factor out fpu/regset.h from fpu/internal.h Only a few places use the regset definitions, so factor them out. Also fix related header dependency assumptions. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 13 ------------- arch/x86/include/asm/fpu/regset.h | 21 +++++++++++++++++++++ arch/x86/include/asm/fpu/xstate.h | 1 + arch/x86/kernel/fpu/core.c | 1 + arch/x86/kernel/fpu/xstate.c | 2 ++ arch/x86/kernel/ptrace.c | 2 +- 6 files changed, 26 insertions(+), 14 deletions(-) create mode 100644 arch/x86/include/asm/fpu/regset.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index e2ceb49d310d..7b62d9032623 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -10,7 +10,6 @@ #ifndef _ASM_X86_FPU_INTERNAL_H #define _ASM_X86_FPU_INTERNAL_H -#include #include #include #include @@ -48,18 +47,6 @@ extern void fpu__resume_cpu(void); DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; - -/* - * xstateregs_active == regset_fpregs_active. Please refer to the comment - * at the definition of regset_fpregs_active. - */ -#define xstateregs_active regset_fpregs_active - #ifdef CONFIG_MATH_EMULATION extern void finit_soft_fpu(struct i387_soft_struct *soft); #else diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h new file mode 100644 index 000000000000..39d3107ac6c7 --- /dev/null +++ b/arch/x86/include/asm/fpu/regset.h @@ -0,0 +1,21 @@ +/* + * FPU regset handling methods: + */ +#ifndef _ASM_X86_FPU_REGSET_H +#define _ASM_X86_FPU_REGSET_H + +#include + +extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + +/* + * xstateregs_active == regset_fpregs_active. Please refer to the comment + * at the definition of regset_fpregs_active. + */ +#define xstateregs_active regset_fpregs_active + +#endif /* _ASM_X86_FPU_REGSET_H */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index ab2c507b58b6..afd21329c585 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -3,6 +3,7 @@ #include #include +#include /* Bit 63 of XCR0 is reserved for future expansion */ #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b61a5136c0e0..07f1cc9d18d9 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -6,6 +6,7 @@ * Gareth Hughes , May 2000 */ #include +#include #include #include diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 78710740e9a0..59bd35a57afc 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -5,9 +5,11 @@ */ #include #include + #include #include #include +#include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 51e73a685ce4..9be72bc3613f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -30,6 +29,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.1 From acd58a3ad0ed5875fb88bbb078309a3d7ee147a0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 09:17:50 +0200 Subject: x86/fpu: Remove run-once init quirks Remove various boot quirks that came from the old code. The new code is cleanly split up into per-system and per-cpu init sequences, and system init functions are only called once. Remove the run-once quirks. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 6 ------ arch/x86/kernel/fpu/xstate.c | 11 ----------- 2 files changed, 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 72219ce2385a..7b6265df6082 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -143,12 +143,6 @@ EXPORT_SYMBOL_GPL(xstate_size); */ static void fpu__init_system_xstate_size_legacy(void) { - static bool on_boot_cpu = 1; - - if (!on_boot_cpu) - return; - on_boot_cpu = 0; - /* * Note that xstate_size might be overwriten later during * fpu__init_system_xstate(). diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 59bd35a57afc..8285d4b40763 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -658,12 +658,6 @@ void setup_xstate_comp(void) */ static void setup_init_fpu_buf(void) { - static int on_boot_cpu = 1; - - if (!on_boot_cpu) - return; - on_boot_cpu = 0; - if (!cpu_has_xsave) return; @@ -719,11 +713,6 @@ static void __init init_xstate_size(void) void fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; - static bool on_boot_cpu = 1; - - if (!on_boot_cpu) - return; - on_boot_cpu = 0; if (!cpu_has_xsave) { pr_info("x86/fpu: Legacy x87 FPU detected.\n"); -- cgit v1.2.1 From e1cebad49c54e0241e101ebf63d5238fe1137749 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 09:29:38 +0200 Subject: x86/fpu: Factor out the exception error code handling code Factor out the FPU error code handling code from traps.c and fpu/internal.h and move them close to each other. Also convert the helper functions to 'struct fpu *', which further simplifies them. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 33 +------------- arch/x86/kernel/fpu/core.c | 88 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 67 +++++----------------------- 3 files changed, 102 insertions(+), 86 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7b62d9032623..dfdafea6e56f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -30,7 +30,8 @@ extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__activate_curr(struct fpu *fpu); extern void fpstate_init(struct fpu *fpu); -extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); +extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); +extern int fpu__exception_code(struct fpu *fpu, int trap_nr); /* * High level FPU state handling functions: @@ -467,34 +468,4 @@ static inline void user_fpu_begin(void) preempt_enable(); } -/* - * i387 state interaction - */ -static inline unsigned short get_fpu_cwd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state.fxsave.cwd; - } else { - return (unsigned short)tsk->thread.fpu.state.fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state.fxsave.swd; - } else { - return (unsigned short)tsk->thread.fpu.state.fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) -{ - if (cpu_has_xmm) { - return tsk->thread.fpu.state.fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 07f1cc9d18d9..4809c32149e8 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -749,3 +750,90 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) EXPORT_SYMBOL(dump_fpu); #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ + +/* + * x87 math exception handling: + */ + +static inline unsigned short get_fpu_cwd(struct fpu *fpu) +{ + if (cpu_has_fxsr) { + return fpu->state.fxsave.cwd; + } else { + return (unsigned short)fpu->state.fsave.cwd; + } +} + +static inline unsigned short get_fpu_swd(struct fpu *fpu) +{ + if (cpu_has_fxsr) { + return fpu->state.fxsave.swd; + } else { + return (unsigned short)fpu->state.fsave.swd; + } +} + +static inline unsigned short get_fpu_mxcsr(struct fpu *fpu) +{ + if (cpu_has_xmm) { + return fpu->state.fxsave.mxcsr; + } else { + return MXCSR_DEFAULT; + } +} + +int fpu__exception_code(struct fpu *fpu, int trap_nr) +{ + int err; + + if (trap_nr == X86_TRAP_MF) { + unsigned short cwd, swd; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(fpu); + swd = get_fpu_swd(fpu); + + err = swd & ~cwd; + } else { + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + unsigned short mxcsr = get_fpu_mxcsr(fpu); + err = ~(mxcsr >> 7) & mxcsr; + } + + if (err & 0x001) { /* Invalid op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + return FPE_FLTINV; + } else if (err & 0x004) { /* Divide by Zero */ + return FPE_FLTDIV; + } else if (err & 0x008) { /* Overflow */ + return FPE_FLTOVF; + } else if (err & 0x012) { /* Denormal, Underflow */ + return FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ + return FPE_FLTRES; + } + + /* + * If we're using IRQ 13, or supposedly even some trap + * X86_TRAP_MF implementations, it's possible + * we get a spurious trap, which is not an error. + */ + return 0; +} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 48dfcd9ed351..cab397d0085f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -708,8 +708,8 @@ NOKPROBE_SYMBOL(do_debug); static void math_error(struct pt_regs *regs, int error_code, int trapnr) { struct task_struct *task = current; + struct fpu *fpu = &task->thread.fpu; siginfo_t info; - unsigned short err; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; @@ -717,8 +717,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) return; conditional_sti(regs); - if (!user_mode(regs)) - { + if (!user_mode(regs)) { if (!fixup_exception(regs)) { task->thread.error_code = error_code; task->thread.trap_nr = trapnr; @@ -730,62 +729,20 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) /* * Save the info for the exception handler and clear the error. */ - fpu__save(&task->thread.fpu); - task->thread.trap_nr = trapnr; + fpu__save(fpu); + + task->thread.trap_nr = trapnr; task->thread.error_code = error_code; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_addr = (void __user *)uprobe_get_trap_addr(regs); - if (trapnr == X86_TRAP_MF) { - unsigned short cwd, swd; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_addr = (void __user *)uprobe_get_trap_addr(regs); - err = swd & ~cwd; - } else { - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - unsigned short mxcsr = get_fpu_mxcsr(task); - err = ~(mxcsr >> 7) & mxcsr; - } + info.si_code = fpu__exception_code(fpu, trapnr); - if (err & 0x001) { /* Invalid op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - } else if (err & 0x004) { /* Divide by Zero */ - info.si_code = FPE_FLTDIV; - } else if (err & 0x008) { /* Overflow */ - info.si_code = FPE_FLTOVF; - } else if (err & 0x012) { /* Denormal, Underflow */ - info.si_code = FPE_FLTUND; - } else if (err & 0x020) { /* Precision */ - info.si_code = FPE_FLTRES; - } else { - /* - * If we're using IRQ 13, or supposedly even some trap - * X86_TRAP_MF implementations, it's possible - * we get a spurious trap, which is not an error. - */ + /* Retry when we get spurious exceptions: */ + if (!info.si_code) return; - } + force_sig_info(SIGFPE, &info, task); } -- cgit v1.2.1 From 0aba69789452faab6f6bd7cd293489bab66352bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 10:08:36 +0200 Subject: x86/fpu: Harmonize the names of the fpstate_init() helper functions Harmonize the inconsistent naming of these related functions: fpstate_init() finit_soft_fpu() => fpstate_init_fsoft() fx_finit() => fpstate_init_fxstate() fx_finit() => fpstate_init_fstate() # split out Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 23 +++++++++++------------ arch/x86/kernel/fpu/core.c | 26 ++++++++++++++++---------- arch/x86/kernel/fpu/init.c | 2 +- arch/x86/math-emu/fpu_aux.c | 4 ++-- 4 files changed, 30 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index dfdafea6e56f..0236ae6ffc26 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -28,7 +28,18 @@ extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__activate_curr(struct fpu *fpu); + extern void fpstate_init(struct fpu *fpu); +#ifdef CONFIG_MATH_EMULATION +extern void fpstate_init_soft(struct i387_soft_struct *soft); +#else +static inline void fpstate_init_soft(struct i387_soft_struct *soft) {} +#endif +static inline void fpstate_init_fxstate(struct i387_fxsave_struct *fx) +{ + fx->cwd = 0x37f; + fx->mxcsr = MXCSR_DEFAULT; +} extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); @@ -48,12 +59,6 @@ extern void fpu__resume_cpu(void); DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -#ifdef CONFIG_MATH_EMULATION -extern void finit_soft_fpu(struct i387_soft_struct *soft); -#else -static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} -#endif - /* * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, * on this CPU. @@ -93,12 +98,6 @@ static __always_inline __pure bool use_fxsr(void) return static_cpu_has_safe(X86_FEATURE_FXSR); } -static inline void fx_finit(struct i387_fxsave_struct *fx) -{ - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; -} - extern void fpstate_sanitize_xstate(struct fpu *fpu); #define user_insn(insn, output, input...) \ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 4809c32149e8..494ab4c57268 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -191,24 +191,30 @@ void fpu__save(struct fpu *fpu) } EXPORT_SYMBOL_GPL(fpu__save); +/* + * Legacy x87 fpstate state init: + */ +static inline void fpstate_init_fstate(struct i387_fsave_struct *fp) +{ + fp->cwd = 0xffff037fu; + fp->swd = 0xffff0000u; + fp->twd = 0xffffffffu; + fp->fos = 0xffff0000u; +} + void fpstate_init(struct fpu *fpu) { if (!cpu_has_fpu) { - finit_soft_fpu(&fpu->state.soft); + fpstate_init_soft(&fpu->state.soft); return; } memset(&fpu->state, 0, xstate_size); - if (cpu_has_fxsr) { - fx_finit(&fpu->state.fxsave); - } else { - struct i387_fsave_struct *fp = &fpu->state.fsave; - fp->cwd = 0xffff037fu; - fp->swd = 0xffff0000u; - fp->twd = 0xffffffffu; - fp->fos = 0xffff0000u; - } + if (cpu_has_fxsr) + fpstate_init_fxstate(&fpu->state.fxsave); + else + fpstate_init_fstate(&fpu->state.fsave); } EXPORT_SYMBOL_GPL(fpstate_init); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7b6265df6082..5a7e57078935 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -121,7 +121,7 @@ static void fpu__init_system_generic(void) * Set up the legacy init FPU context. (xstate init might overwrite this * with a more modern format, if the CPU supports it.) */ - fx_finit(&init_xstate_ctx.i387); + fpstate_init_fxstate(&init_xstate_ctx.i387); fpu__init_system_mxcsr(); } diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 7562341ce299..768b2b8271d6 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -30,7 +30,7 @@ static void fclex(void) } /* Needs to be externally visible */ -void finit_soft_fpu(struct i387_soft_struct *soft) +void fpstate_init_soft(struct i387_soft_struct *soft) { struct address *oaddr, *iaddr; memset(soft, 0, sizeof(*soft)); @@ -52,7 +52,7 @@ void finit_soft_fpu(struct i387_soft_struct *soft) void finit(void) { - finit_soft_fpu(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.state.soft); } /* -- cgit v1.2.1 From bf935b0b526ffa0607476dfc6198593553957dd9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 10:23:42 +0200 Subject: x86/fpu: Create 'union thread_xstate' helper for fpstate_init() fpstate_init() only uses fpu->state, so pass that in to it. This enables the cleanup we will do in the next patch. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/core.c | 14 +++++++------- arch/x86/kernel/fpu/xstate.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0236ae6ffc26..b74aa4329aeb 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -29,7 +29,7 @@ extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__activate_curr(struct fpu *fpu); -extern void fpstate_init(struct fpu *fpu); +extern void fpstate_init(union thread_xstate *state); #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct i387_soft_struct *soft); #else diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 494ab4c57268..8e4cad57be3d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -202,19 +202,19 @@ static inline void fpstate_init_fstate(struct i387_fsave_struct *fp) fp->fos = 0xffff0000u; } -void fpstate_init(struct fpu *fpu) +void fpstate_init(union thread_xstate *state) { if (!cpu_has_fpu) { - fpstate_init_soft(&fpu->state.soft); + fpstate_init_soft(&state->soft); return; } - memset(&fpu->state, 0, xstate_size); + memset(state, 0, xstate_size); if (cpu_has_fxsr) - fpstate_init_fxstate(&fpu->state.fxsave); + fpstate_init_fxstate(&state->fxsave); else - fpstate_init_fstate(&fpu->state.fsave); + fpstate_init_fstate(&state->fsave); } EXPORT_SYMBOL_GPL(fpstate_init); @@ -282,7 +282,7 @@ void fpu__activate_curr(struct fpu *fpu) WARN_ON_ONCE(fpu != ¤t->thread.fpu); if (!fpu->fpstate_active) { - fpstate_init(fpu); + fpstate_init(&fpu->state); /* Safe to do for the current task: */ fpu->fpstate_active = 1; @@ -321,7 +321,7 @@ static void fpu__activate_stopped(struct fpu *child_fpu) if (child_fpu->fpstate_active) { child_fpu->last_cpu = -1; } else { - fpstate_init(child_fpu); + fpstate_init(&child_fpu->state); /* Safe to do for stopped child tasks: */ child_fpu->fpstate_active = 1; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 8285d4b40763..afbd58277430 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -456,7 +456,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || __copy_from_user(&env, buf, sizeof(env))) { - fpstate_init(fpu); + fpstate_init(&fpu->state); err = -1; } else { sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4438179398b..3d811bb2728f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7004,7 +7004,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) static void fx_init(struct kvm_vcpu *vcpu) { - fpstate_init(&vcpu->arch.guest_fpu); + fpstate_init(&vcpu->arch.guest_fpu.state); if (cpu_has_xsaves) vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; -- cgit v1.2.1 From 6f57502310c85b60bdea78228e9b5bb3e82dc3b7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 11:07:06 +0200 Subject: x86/fpu: Generalize 'init_xstate_ctx' So the handling of init_xstate_ctx has a layering violation: both 'struct xsave_struct' and 'union thread_xstate' have a 'struct i387_fxsave_struct' member: xsave_struct::i387 thread_xstate::fxsave The handling of init_xstate_ctx is generic, it is used on all CPUs, with or without XSAVE instruction. So it's confusing how the generic code passes around and handles an XSAVE specific format. What we really want is for init_xstate_ctx to be a proper fpstate and we use its ::fxsave and ::xsave members, as appropriate. Since the xsave_struct::i387 and thread_xstate::fxsave aliases each other this is not a functional problem. So implement this, and move init_xstate_ctx to the generic FPU code in the process. Also, since init_xstate_ctx is not XSAVE specific anymore, rename it to init_fpstate, and mark it __read_mostly, because it's only modified once during bootup, and used as a reference fpstate later on. There's no change in functionality. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 6 ++++-- arch/x86/include/asm/fpu/xstate.h | 1 - arch/x86/kernel/fpu/core.c | 6 ++++++ arch/x86/kernel/fpu/init.c | 2 +- arch/x86/kernel/fpu/xstate.c | 19 +++++++------------ 5 files changed, 18 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index b74aa4329aeb..792fdbe64179 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -22,6 +22,8 @@ extern unsigned int mxcsr_feature_mask; +extern union thread_xstate init_fpstate; + extern void fpu__init_cpu(void); extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); @@ -342,9 +344,9 @@ static inline void fpregs_deactivate(struct fpu *fpu) static inline void restore_init_xstate(void) { if (use_xsave()) - xrstor_state(&init_xstate_ctx, -1); + xrstor_state(&init_fpstate.xsave, -1); else - fxrstor_checking(&init_xstate_ctx.i387); + fxrstor_checking(&init_fpstate.fxsave); } /* diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index afd21329c585..3051280887b8 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -37,7 +37,6 @@ extern unsigned int xstate_size; extern u64 xfeatures_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern struct xsave_struct init_xstate_ctx; extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8e4cad57be3d..a396f80b771f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -12,6 +12,12 @@ #include +/* + * Represents the initial FPU state. It's mostly (but not completely) zeroes, + * depending on the FPU hardware format: + */ +union thread_xstate init_fpstate __read_mostly; + /* * Track whether the kernel is using the FPU state * currently. diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 5a7e57078935..93bc11a5812c 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -121,7 +121,7 @@ static void fpu__init_system_generic(void) * Set up the legacy init FPU context. (xstate init might overwrite this * with a more modern format, if the CPU supports it.) */ - fpstate_init_fxstate(&init_xstate_ctx.i387); + fpstate_init_fxstate(&init_fpstate.fxsave); fpu__init_system_mxcsr(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index afbd58277430..527d4bf4f304 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -31,11 +31,6 @@ static const char *xfeature_names[] = */ u64 xfeatures_mask __read_mostly; -/* - * Represents init state for the supported extended state. - */ -struct xsave_struct init_xstate_ctx; - static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; static unsigned int xstate_offsets[XFEATURES_NR_MAX], xstate_sizes[XFEATURES_NR_MAX]; static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; @@ -150,7 +145,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) int size = xstate_sizes[feature_bit]; memcpy((void *)fx + offset, - (void *)&init_xstate_ctx + offset, + (void *)&init_fpstate.xsave + offset, size); } @@ -377,12 +372,12 @@ static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) if (use_xsave()) { if ((unsigned long)buf % 64 || fx_only) { u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE; - xrstor_state(&init_xstate_ctx, init_bv); + xrstor_state(&init_fpstate.xsave, init_bv); return fxrstor_user(buf); } else { u64 init_bv = xfeatures_mask & ~xbv; if (unlikely(init_bv)) - xrstor_state(&init_xstate_ctx, init_bv); + xrstor_state(&init_fpstate.xsave, init_bv); return xrestore_user(buf, xbv); } } else if (use_fxsr()) { @@ -665,20 +660,20 @@ static void setup_init_fpu_buf(void) print_xstate_features(); if (cpu_has_xsaves) { - init_xstate_ctx.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; - init_xstate_ctx.header.xfeatures = xfeatures_mask; + init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; + init_fpstate.xsave.header.xfeatures = xfeatures_mask; } /* * Init all the features state with header_bv being 0x0 */ - xrstor_state_booting(&init_xstate_ctx, -1); + xrstor_state_booting(&init_fpstate.xsave, -1); /* * Dump the init state again. This is to identify the init state * of any feature which is not represented by all zero's. */ - xsave_state_booting(&init_xstate_ctx); + xsave_state_booting(&init_fpstate.xsave); } /* -- cgit v1.2.1 From 815418890e2a3984d8b04c433072df1a42573f96 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 11:21:59 +0200 Subject: x86/fpu: Move restore_init_xstate() out of fpu/internal.h Move restore_init_xstate() next to its sole caller. Also rename it to copy_init_fpstate_to_fpregs() and add some comments about what it does. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 8 -------- arch/x86/kernel/fpu/core.c | 14 +++++++++++++- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 792fdbe64179..a1810eb39afa 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -341,14 +341,6 @@ static inline void fpregs_deactivate(struct fpu *fpu) __fpregs_deactivate_hw(); } -static inline void restore_init_xstate(void) -{ - if (use_xsave()) - xrstor_state(&init_fpstate.xsave, -1); - else - fxrstor_checking(&init_fpstate.fxsave); -} - /* * Definitions for the eXtended Control Register instructions */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a396f80b771f..2ea9e2f9c486 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -391,6 +391,18 @@ void fpu__drop(struct fpu *fpu) preempt_enable(); } +/* + * Clear FPU registers by setting them up from + * the init fpstate: + */ +static inline void copy_init_fpstate_to_fpregs(void) +{ + if (use_xsave()) + xrstor_state(&init_fpstate.xsave, -1); + else + fxrstor_checking(&init_fpstate.fxsave); +} + /* * Clear the FPU state back to init state. * @@ -409,7 +421,7 @@ void fpu__clear(struct fpu *fpu) fpu__activate_curr(fpu); user_fpu_begin(); } - restore_init_xstate(); + copy_init_fpstate_to_fpregs(); } } -- cgit v1.2.1 From c681314421c7c70c418190f3b4ffb4d3257ea5be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 11:34:09 +0200 Subject: x86/fpu: Rename all the fpregs, xregs, fxregs and fregs handling functions Standardize the naming of the various functions that copy register content in specific FPU context formats: copy_fxregs_to_kernel() # was: fpu_fxsave() copy_xregs_to_kernel() # was: xsave_state() copy_kernel_to_fregs() # was: frstor_checking() copy_kernel_to_fxregs() # was: fxrstor_checking() copy_kernel_to_xregs() # was: fpu_xrstor_checking() copy_kernel_to_xregs_booting() # was: xrstor_state_booting() copy_fregs_to_user() # was: fsave_user() copy_fxregs_to_user() # was: fxsave_user() copy_xregs_to_user() # was: xsave_user() copy_user_to_fregs() # was: frstor_user() copy_user_to_fxregs() # was: fxrstor_user() copy_user_to_xregs() # was: xrestore_user() copy_user_to_fpregs_zeroing() # was: restore_user_xstate() Eliminate fpu_xrstor_checking(), because it was just a wrapper. No change in functionality. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 30 +++++++++++++++--------------- arch/x86/include/asm/fpu/xstate.h | 20 ++++++-------------- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/fpu/xstate.c | 28 ++++++++++++++-------------- arch/x86/mm/mpx.c | 2 +- 5 files changed, 38 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a1810eb39afa..f23ea10d3a1f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -133,57 +133,57 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); err; \ }) -static inline int fsave_user(struct i387_fsave_struct __user *fx) +static inline int copy_fregs_to_user(struct i387_fsave_struct __user *fx) { return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } -static inline int fxsave_user(struct i387_fxsave_struct __user *fx) +static inline int copy_fxregs_to_user(struct i387_fxsave_struct __user *fx) { if (config_enabled(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); else if (config_enabled(CONFIG_AS_FXSAVEQ)) return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - /* See comment in fpu_fxsave() below. */ + /* See comment in copy_fxregs_to_kernel() below. */ return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); } -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) +static inline int copy_kernel_to_fxregs(struct i387_fxsave_struct *fx) { if (config_enabled(CONFIG_X86_32)) return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else if (config_enabled(CONFIG_AS_FXSAVEQ)) return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - /* See comment in fpu_fxsave() below. */ + /* See comment in copy_fxregs_to_kernel() below. */ return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); } -static inline int fxrstor_user(struct i387_fxsave_struct __user *fx) +static inline int copy_user_to_fxregs(struct i387_fxsave_struct __user *fx) { if (config_enabled(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else if (config_enabled(CONFIG_AS_FXSAVEQ)) return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - /* See comment in fpu_fxsave() below. */ + /* See comment in copy_fxregs_to_kernel() below. */ return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); } -static inline int frstor_checking(struct i387_fsave_struct *fx) +static inline int copy_kernel_to_fregs(struct i387_fsave_struct *fx) { return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline int frstor_user(struct i387_fsave_struct __user *fx) +static inline int copy_user_to_fregs(struct i387_fsave_struct __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline void fpu_fxsave(struct fpu *fpu) +static inline void copy_fxregs_to_kernel(struct fpu *fpu) { if (config_enabled(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); @@ -230,12 +230,12 @@ static inline void fpu_fxsave(struct fpu *fpu) static inline int copy_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { - xsave_state(&fpu->state.xsave); + copy_xregs_to_kernel(&fpu->state.xsave); return 1; } if (likely(use_fxsr())) { - fpu_fxsave(fpu); + copy_fxregs_to_kernel(fpu); return 1; } @@ -251,11 +251,11 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) static inline int __copy_fpstate_to_fpregs(struct fpu *fpu) { if (use_xsave()) - return fpu_xrstor_checking(&fpu->state.xsave); + return copy_kernel_to_xregs(&fpu->state.xsave, -1); else if (use_fxsr()) - return fxrstor_checking(&fpu->state.fxsave); + return copy_kernel_to_fxregs(&fpu->state.fxsave); else - return frstor_checking(&fpu->state.fsave); + return copy_kernel_to_fregs(&fpu->state.fsave); } static inline int copy_fpstate_to_fpregs(struct fpu *fpu) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 3051280887b8..7f59480697a3 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -58,7 +58,7 @@ extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline int xsave_state_booting(struct xsave_struct *fx) +static inline int copy_xregs_to_kernel_booting(struct xsave_struct *fx) { u64 mask = -1; u32 lmask = mask; @@ -86,7 +86,7 @@ static inline int xsave_state_booting(struct xsave_struct *fx) * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) +static inline int copy_kernel_to_xregs_booting(struct xsave_struct *fx, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; @@ -112,7 +112,7 @@ static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) /* * Save processor xstate to xsave area. */ -static inline int xsave_state(struct xsave_struct *fx) +static inline int copy_xregs_to_kernel(struct xsave_struct *fx) { u64 mask = -1; u32 lmask = mask; @@ -151,7 +151,7 @@ static inline int xsave_state(struct xsave_struct *fx) /* * Restore processor xstate from xsave area. */ -static inline int xrstor_state(struct xsave_struct *fx, u64 mask) +static inline int copy_kernel_to_xregs(struct xsave_struct *fx, u64 mask) { int err = 0; u32 lmask = mask; @@ -176,14 +176,6 @@ static inline int xrstor_state(struct xsave_struct *fx, u64 mask) return err; } -/* - * Restore xstate context for new process during context switch. - */ -static inline int fpu_xrstor_checking(struct xsave_struct *fx) -{ - return xrstor_state(fx, -1); -} - /* * Save xstate to user space xsave area. * @@ -194,7 +186,7 @@ static inline int fpu_xrstor_checking(struct xsave_struct *fx) * backward compatibility for old applications which don't understand * compacted format of xsave area. */ -static inline int xsave_user(struct xsave_struct __user *buf) +static inline int copy_xregs_to_user(struct xsave_struct __user *buf) { int err; @@ -218,7 +210,7 @@ static inline int xsave_user(struct xsave_struct __user *buf) /* * Restore xstate from user space xsave area. */ -static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) +static inline int copy_user_to_xregs(struct xsave_struct __user *buf, u64 mask) { int err = 0; struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 2ea9e2f9c486..2cb75bd40dc0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -398,9 +398,9 @@ void fpu__drop(struct fpu *fpu) static inline void copy_init_fpstate_to_fpregs(void) { if (use_xsave()) - xrstor_state(&init_fpstate.xsave, -1); + copy_kernel_to_xregs(&init_fpstate.xsave, -1); else - fxrstor_checking(&init_fpstate.fxsave); + copy_kernel_to_fxregs(&init_fpstate.fxsave); } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 527d4bf4f304..336b3dae59ca 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -260,11 +260,11 @@ static inline int copy_fpregs_to_sigframe(struct xsave_struct __user *buf) int err; if (use_xsave()) - err = xsave_user(buf); + err = copy_xregs_to_user(buf); else if (use_fxsr()) - err = fxsave_user((struct i387_fxsave_struct __user *) buf); + err = copy_fxregs_to_user((struct i387_fxsave_struct __user *) buf); else - err = fsave_user((struct i387_fsave_struct __user *) buf); + err = copy_fregs_to_user((struct i387_fsave_struct __user *) buf); if (unlikely(err) && __clear_user(buf, xstate_size)) err = -EFAULT; @@ -314,7 +314,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) return -1; /* Update the thread's fxstate to save the fsave header. */ if (ia32_fxstate) - fpu_fxsave(&tsk->thread.fpu); + copy_fxregs_to_kernel(&tsk->thread.fpu); } else { fpstate_sanitize_xstate(&tsk->thread.fpu); if (__copy_to_user(buf_fx, xsave, xstate_size)) @@ -367,23 +367,23 @@ sanitize_restored_xstate(struct task_struct *tsk, /* * Restore the extended state if present. Otherwise, restore the FP/SSE state. */ -static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) +static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { if (use_xsave()) { if ((unsigned long)buf % 64 || fx_only) { u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE; - xrstor_state(&init_fpstate.xsave, init_bv); - return fxrstor_user(buf); + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return copy_user_to_fxregs(buf); } else { u64 init_bv = xfeatures_mask & ~xbv; if (unlikely(init_bv)) - xrstor_state(&init_fpstate.xsave, init_bv); - return xrestore_user(buf, xbv); + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return copy_user_to_xregs(buf, xbv); } } else if (use_fxsr()) { - return fxrstor_user(buf); + return copy_user_to_fxregs(buf); } else - return frstor_user(buf); + return copy_user_to_fregs(buf); } static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) @@ -471,7 +471,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * state to the registers directly (with exceptions handled). */ user_fpu_begin(); - if (restore_user_xstate(buf_fx, xfeatures, fx_only)) { + if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { fpu__clear(fpu); return -1; } @@ -667,13 +667,13 @@ static void setup_init_fpu_buf(void) /* * Init all the features state with header_bv being 0x0 */ - xrstor_state_booting(&init_fpstate.xsave, -1); + copy_kernel_to_xregs_booting(&init_fpstate.xsave, -1); /* * Dump the init state again. This is to identify the init state * of any feature which is not represented by all zero's. */ - xsave_state_booting(&init_fpstate.xsave); + copy_xregs_to_kernel_booting(&init_fpstate.xsave); } /* diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index ea5b367b63a9..5e20bacee210 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -389,7 +389,7 @@ int mpx_enable_management(struct task_struct *tsk) * directory into XSAVE/XRSTOR Save Area and enable MPX through * XRSTOR instruction. * - * xsave_state() is expected to be very expensive. Storing the bounds + * copy_xregs_to_kernel() is expected to be very expensive. Storing the bounds * directory here means that we do not have to do xsave in the unmap * path; we can just use mm->bd_addr instead. */ -- cgit v1.2.1 From b992c660d3b3161ddb992e1b0cac727de574709c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 12:45:38 +0200 Subject: x86/fpu: Factor out fpu/signal.c fpu/xstate.c has a lot of generic FPU signal frame handling routines, move them into a separate file: fpu/signal.c. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/signal.h | 2 + arch/x86/kernel/fpu/Makefile | 2 +- arch/x86/kernel/fpu/signal.c | 404 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/xstate.c | 394 +------------------------------------ 4 files changed, 409 insertions(+), 393 deletions(-) create mode 100644 arch/x86/kernel/fpu/signal.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 0803dc2aba80..7358e9d61f1e 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -28,4 +28,6 @@ unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size); +extern void fpu__init_prepare_fx_sw_frame(void); + #endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile index 6ae59bccdd2f..5c697ded57f2 100644 --- a/arch/x86/kernel/fpu/Makefile +++ b/arch/x86/kernel/fpu/Makefile @@ -2,4 +2,4 @@ # Build rules for the FPU support code: # -obj-y += init.o bugs.o core.o xstate.o +obj-y += init.o bugs.o core.o signal.o xstate.o diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c new file mode 100644 index 000000000000..8d0c26ab5123 --- /dev/null +++ b/arch/x86/kernel/fpu/signal.c @@ -0,0 +1,404 @@ +/* + * FPU signal frame handling routines. + */ + +#include +#include + +#include +#include +#include + +#include + +static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; + +/* + * Check for the presence of extended state information in the + * user fpstate pointer in the sigcontext. + */ +static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw) +{ + int min_xstate_size = sizeof(struct i387_fxsave_struct) + + sizeof(struct xstate_header); + unsigned int magic2; + + if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) + return -1; + + /* Check for the first magic field and other error scenarios. */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || + fx_sw->xstate_size < min_xstate_size || + fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fx_sw->extended_size) + return -1; + + /* + * Check for the presence of second magic word at the end of memory + * layout. This detects the case where the user just copied the legacy + * fpstate layout with out copying the extended state information + * in the memory layout. + */ + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) + || magic2 != FP_XSTATE_MAGIC2) + return -1; + + return 0; +} + +/* + * Signal frame handlers. + */ +static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +{ + if (use_fxsr()) { + struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; + struct user_i387_ia32_struct env; + struct _fpstate_ia32 __user *fp = buf; + + convert_from_fxsr(&env, tsk); + + if (__copy_to_user(buf, &env, sizeof(env)) || + __put_user(xsave->i387.swd, &fp->status) || + __put_user(X86_FXSR_MAGIC, &fp->magic)) + return -1; + } else { + struct i387_fsave_struct __user *fp = buf; + u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) + return -1; + } + + return 0; +} + +static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +{ + struct xsave_struct __user *x = buf; + struct _fpx_sw_bytes *sw_bytes; + u32 xfeatures; + int err; + + /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ + sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; + err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + + if (!use_xsave()) + return err; + + err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); + + /* + * Read the xfeatures which we copied (directly from the cpu or + * from the state in task struct) to the user buffers. + */ + err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures); + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xfeatures in the xsave header. + * + * xsave aware apps can change the xfeatures in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xfeatures |= XSTATE_FPSSE; + + err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); + + return err; +} + +static inline int copy_fpregs_to_sigframe(struct xsave_struct __user *buf) +{ + int err; + + if (use_xsave()) + err = copy_xregs_to_user(buf); + else if (use_fxsr()) + err = copy_fxregs_to_user((struct i387_fxsave_struct __user *) buf); + else + err = copy_fregs_to_user((struct i387_fsave_struct __user *) buf); + + if (unlikely(err) && __clear_user(buf, xstate_size)) + err = -EFAULT; + return err; +} + +/* + * Save the fpu, extended register state to the user signal frame. + * + * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save + * state is copied. + * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. + * + * buf == buf_fx for 64-bit frames and 32-bit fsave frame. + * buf != buf_fx for 32-bit frames with fxstate. + * + * If the fpu, extended register state is live, save the state directly + * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, + * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * + * If this is a 32-bit frame with fxstate, put a fsave header before + * the aligned state at 'buf_fx'. + * + * For [f]xsave state, update the SW reserved fields in the [f]xsave frame + * indicating the absence/presence of the extended state to the user. + */ +int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +{ + struct xsave_struct *xsave = ¤t->thread.fpu.state.xsave; + struct task_struct *tsk = current; + int ia32_fxstate = (buf != buf_fx); + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!access_ok(VERIFY_WRITE, buf, size)) + return -EACCES; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), NULL, + (struct _fpstate_ia32 __user *) buf) ? -1 : 1; + + if (fpregs_active()) { + /* Save the live register state to the user directly. */ + if (copy_fpregs_to_sigframe(buf_fx)) + return -1; + /* Update the thread's fxstate to save the fsave header. */ + if (ia32_fxstate) + copy_fxregs_to_kernel(&tsk->thread.fpu); + } else { + fpstate_sanitize_xstate(&tsk->thread.fpu); + if (__copy_to_user(buf_fx, xsave, xstate_size)) + return -1; + } + + /* Save the fsave header for the 32-bit frames. */ + if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) + return -1; + + if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) + return -1; + + return 0; +} + +static inline void +sanitize_restored_xstate(struct task_struct *tsk, + struct user_i387_ia32_struct *ia32_env, + u64 xfeatures, int fx_only) +{ + struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; + struct xstate_header *header = &xsave->header; + + if (use_xsave()) { + /* These bits must be zero. */ + memset(header->reserved, 0, 48); + + /* + * Init the state that is not present in the memory + * layout and not enabled by the OS. + */ + if (fx_only) + header->xfeatures = XSTATE_FPSSE; + else + header->xfeatures &= (xfeatures_mask & xfeatures); + } + + if (use_fxsr()) { + /* + * mscsr reserved bits must be masked to zero for security + * reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + + convert_to_fxsr(tsk, ia32_env); + } +} + +/* + * Restore the extended state if present. Otherwise, restore the FP/SSE state. + */ +static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +{ + if (use_xsave()) { + if ((unsigned long)buf % 64 || fx_only) { + u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return copy_user_to_fxregs(buf); + } else { + u64 init_bv = xfeatures_mask & ~xbv; + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return copy_user_to_xregs(buf, xbv); + } + } else if (use_fxsr()) { + return copy_user_to_fxregs(buf); + } else + return copy_user_to_fregs(buf); +} + +static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) +{ + int ia32_fxstate = (buf != buf_fx); + struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; + int state_size = xstate_size; + u64 xfeatures = 0; + int fx_only = 0; + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!buf) { + fpu__clear(fpu); + return 0; + } + + if (!access_ok(VERIFY_READ, buf, size)) + return -EACCES; + + fpu__activate_curr(fpu); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; + + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; + if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { + /* + * Couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. + */ + state_size = sizeof(struct i387_fxsave_struct); + fx_only = 1; + } else { + state_size = fx_sw_user.xstate_size; + xfeatures = fx_sw_user.xfeatures; + } + } + + if (ia32_fxstate) { + /* + * For 32-bit frames with fxstate, copy the user state to the + * thread's fpu state, reconstruct fxstate from the fsave + * header. Sanitize the copied state etc. + */ + struct fpu *fpu = &tsk->thread.fpu; + struct user_i387_ia32_struct env; + int err = 0; + + /* + * Drop the current fpu which clears fpu->fpstate_active. This ensures + * that any context-switch during the copy of the new state, + * avoids the intermediate state from getting restored/saved. + * Thus avoiding the new restored state from getting corrupted. + * We will be ready to restore/save the state only after + * fpu->fpstate_active is again set. + */ + fpu__drop(fpu); + + if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || + __copy_from_user(&env, buf, sizeof(env))) { + fpstate_init(&fpu->state); + err = -1; + } else { + sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); + } + + fpu->fpstate_active = 1; + if (use_eager_fpu()) { + preempt_disable(); + fpu__restore(); + preempt_enable(); + } + + return err; + } else { + /* + * For 64-bit frames and 32-bit fsave frames, restore the user + * state to the registers directly (with exceptions handled). + */ + user_fpu_begin(); + if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { + fpu__clear(fpu); + return -1; + } + } + + return 0; +} + +static inline int xstate_sigframe_size(void) +{ + return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; +} + +/* + * Restore FPU state from a sigframe: + */ +int fpu__restore_sig(void __user *buf, int ia32_frame) +{ + void __user *buf_fx = buf; + int size = xstate_sigframe_size(); + + if (ia32_frame && use_fxsr()) { + buf_fx = buf + sizeof(struct i387_fsave_struct); + size += sizeof(struct i387_fsave_struct); + } + + return __fpu__restore_sig(buf, buf_fx, size); +} + +unsigned long +fpu__alloc_mathframe(unsigned long sp, int ia32_frame, + unsigned long *buf_fx, unsigned long *size) +{ + unsigned long frame_size = xstate_sigframe_size(); + + *buf_fx = sp = round_down(sp - frame_size, 64); + if (ia32_frame && use_fxsr()) { + frame_size += sizeof(struct i387_fsave_struct); + sp -= sizeof(struct i387_fsave_struct); + } + + *size = frame_size; + + return sp; +} +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed by the fpstate pointer in the sigcontext. + * This will be saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +void fpu__init_prepare_fx_sw_frame(void) +{ + int fsave_header_size = sizeof(struct i387_fsave_struct); + int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + + if (config_enabled(CONFIG_X86_32)) + size += fsave_header_size; + + fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; + fx_sw_reserved.extended_size = size; + fx_sw_reserved.xfeatures = xfeatures_mask; + fx_sw_reserved.xstate_size = xstate_size; + + if (config_enabled(CONFIG_IA32_EMULATION)) { + fx_sw_reserved_ia32 = fx_sw_reserved; + fx_sw_reserved_ia32.extended_size += fsave_header_size; + } +} + diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 336b3dae59ca..3629e2ef3c94 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -10,7 +10,7 @@ #include #include #include -#include + #include static const char *xfeature_names[] = @@ -31,7 +31,6 @@ static const char *xfeature_names[] = */ u64 xfeatures_mask __read_mostly; -static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; static unsigned int xstate_offsets[XFEATURES_NR_MAX], xstate_sizes[XFEATURES_NR_MAX]; static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; @@ -154,395 +153,6 @@ void fpstate_sanitize_xstate(struct fpu *fpu) } } -/* - * Check for the presence of extended state information in the - * user fpstate pointer in the sigcontext. - */ -static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *fx_sw) -{ - int min_xstate_size = sizeof(struct i387_fxsave_struct) + - sizeof(struct xstate_header); - unsigned int magic2; - - if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) - return -1; - - /* Check for the first magic field and other error scenarios. */ - if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || - fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > xstate_size || - fx_sw->xstate_size > fx_sw->extended_size) - return -1; - - /* - * Check for the presence of second magic word at the end of memory - * layout. This detects the case where the user just copied the legacy - * fpstate layout with out copying the extended state information - * in the memory layout. - */ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) - || magic2 != FP_XSTATE_MAGIC2) - return -1; - - return 0; -} - -/* - * Signal frame handlers. - */ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) -{ - if (use_fxsr()) { - struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; - struct user_i387_ia32_struct env; - struct _fpstate_ia32 __user *fp = buf; - - convert_from_fxsr(&env, tsk); - - if (__copy_to_user(buf, &env, sizeof(env)) || - __put_user(xsave->i387.swd, &fp->status) || - __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; - } else { - struct i387_fsave_struct __user *fp = buf; - u32 swd; - if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; - } - - return 0; -} - -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) -{ - struct xsave_struct __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; - u32 xfeatures; - int err; - - /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); - - if (!use_xsave()) - return err; - - err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); - - /* - * Read the xfeatures which we copied (directly from the cpu or - * from the state in task struct) to the user buffers. - */ - err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures); - - /* - * For legacy compatible, we always set FP/SSE bits in the bit - * vector while saving the state to the user context. This will - * enable us capturing any changes(during sigreturn) to - * the FP/SSE bits by the legacy applications which don't touch - * xfeatures in the xsave header. - * - * xsave aware apps can change the xfeatures in the xsave - * header as well as change any contents in the memory layout. - * xrestore as part of sigreturn will capture all the changes. - */ - xfeatures |= XSTATE_FPSSE; - - err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); - - return err; -} - -static inline int copy_fpregs_to_sigframe(struct xsave_struct __user *buf) -{ - int err; - - if (use_xsave()) - err = copy_xregs_to_user(buf); - else if (use_fxsr()) - err = copy_fxregs_to_user((struct i387_fxsave_struct __user *) buf); - else - err = copy_fregs_to_user((struct i387_fsave_struct __user *) buf); - - if (unlikely(err) && __clear_user(buf, xstate_size)) - err = -EFAULT; - return err; -} - -/* - * Save the fpu, extended register state to the user signal frame. - * - * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save - * state is copied. - * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. - * - * buf == buf_fx for 64-bit frames and 32-bit fsave frame. - * buf != buf_fx for 32-bit frames with fxstate. - * - * If the fpu, extended register state is live, save the state directly - * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, - * copy the thread's fpu state to the user frame starting at 'buf_fx'. - * - * If this is a 32-bit frame with fxstate, put a fsave header before - * the aligned state at 'buf_fx'. - * - * For [f]xsave state, update the SW reserved fields in the [f]xsave frame - * indicating the absence/presence of the extended state to the user. - */ -int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) -{ - struct xsave_struct *xsave = ¤t->thread.fpu.state.xsave; - struct task_struct *tsk = current; - int ia32_fxstate = (buf != buf_fx); - - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); - - if (!access_ok(VERIFY_WRITE, buf, size)) - return -EACCES; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), NULL, - (struct _fpstate_ia32 __user *) buf) ? -1 : 1; - - if (fpregs_active()) { - /* Save the live register state to the user directly. */ - if (copy_fpregs_to_sigframe(buf_fx)) - return -1; - /* Update the thread's fxstate to save the fsave header. */ - if (ia32_fxstate) - copy_fxregs_to_kernel(&tsk->thread.fpu); - } else { - fpstate_sanitize_xstate(&tsk->thread.fpu); - if (__copy_to_user(buf_fx, xsave, xstate_size)) - return -1; - } - - /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; - - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; - - return 0; -} - -static inline void -sanitize_restored_xstate(struct task_struct *tsk, - struct user_i387_ia32_struct *ia32_env, - u64 xfeatures, int fx_only) -{ - struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; - struct xstate_header *header = &xsave->header; - - if (use_xsave()) { - /* These bits must be zero. */ - memset(header->reserved, 0, 48); - - /* - * Init the state that is not present in the memory - * layout and not enabled by the OS. - */ - if (fx_only) - header->xfeatures = XSTATE_FPSSE; - else - header->xfeatures &= (xfeatures_mask & xfeatures); - } - - if (use_fxsr()) { - /* - * mscsr reserved bits must be masked to zero for security - * reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - - convert_to_fxsr(tsk, ia32_env); - } -} - -/* - * Restore the extended state if present. Otherwise, restore the FP/SSE state. - */ -static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) -{ - if (use_xsave()) { - if ((unsigned long)buf % 64 || fx_only) { - u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_fxregs(buf); - } else { - u64 init_bv = xfeatures_mask & ~xbv; - if (unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_xregs(buf, xbv); - } - } else if (use_fxsr()) { - return copy_user_to_fxregs(buf); - } else - return copy_user_to_fregs(buf); -} - -static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) -{ - int ia32_fxstate = (buf != buf_fx); - struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; - int state_size = xstate_size; - u64 xfeatures = 0; - int fx_only = 0; - - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); - - if (!buf) { - fpu__clear(fpu); - return 0; - } - - if (!access_ok(VERIFY_READ, buf, size)) - return -EACCES; - - fpu__activate_curr(fpu); - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; - - if (use_xsave()) { - struct _fpx_sw_bytes fx_sw_user; - if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { - /* - * Couldn't find the extended state information in the - * memory layout. Restore just the FP/SSE and init all - * the other extended state. - */ - state_size = sizeof(struct i387_fxsave_struct); - fx_only = 1; - } else { - state_size = fx_sw_user.xstate_size; - xfeatures = fx_sw_user.xfeatures; - } - } - - if (ia32_fxstate) { - /* - * For 32-bit frames with fxstate, copy the user state to the - * thread's fpu state, reconstruct fxstate from the fsave - * header. Sanitize the copied state etc. - */ - struct fpu *fpu = &tsk->thread.fpu; - struct user_i387_ia32_struct env; - int err = 0; - - /* - * Drop the current fpu which clears fpu->fpstate_active. This ensures - * that any context-switch during the copy of the new state, - * avoids the intermediate state from getting restored/saved. - * Thus avoiding the new restored state from getting corrupted. - * We will be ready to restore/save the state only after - * fpu->fpstate_active is again set. - */ - fpu__drop(fpu); - - if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) { - fpstate_init(&fpu->state); - err = -1; - } else { - sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); - } - - fpu->fpstate_active = 1; - if (use_eager_fpu()) { - preempt_disable(); - fpu__restore(); - preempt_enable(); - } - - return err; - } else { - /* - * For 64-bit frames and 32-bit fsave frames, restore the user - * state to the registers directly (with exceptions handled). - */ - user_fpu_begin(); - if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { - fpu__clear(fpu); - return -1; - } - } - - return 0; -} - -static inline int xstate_sigframe_size(void) -{ - return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; -} - -/* - * Restore FPU state from a sigframe: - */ -int fpu__restore_sig(void __user *buf, int ia32_frame) -{ - void __user *buf_fx = buf; - int size = xstate_sigframe_size(); - - if (ia32_frame && use_fxsr()) { - buf_fx = buf + sizeof(struct i387_fsave_struct); - size += sizeof(struct i387_fsave_struct); - } - - return __fpu__restore_sig(buf, buf_fx, size); -} - -unsigned long -fpu__alloc_mathframe(unsigned long sp, int ia32_frame, - unsigned long *buf_fx, unsigned long *size) -{ - unsigned long frame_size = xstate_sigframe_size(); - - *buf_fx = sp = round_down(sp - frame_size, 64); - if (ia32_frame && use_fxsr()) { - frame_size += sizeof(struct i387_fsave_struct); - sp -= sizeof(struct i387_fsave_struct); - } - - *size = frame_size; - - return sp; -} -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. - */ -static void prepare_fx_sw_frame(void) -{ - int fsave_header_size = sizeof(struct i387_fsave_struct); - int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - - if (config_enabled(CONFIG_X86_32)) - size += fsave_header_size; - - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask; - fx_sw_reserved.xstate_size = xstate_size; - - if (config_enabled(CONFIG_IA32_EMULATION)) { - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size += fsave_header_size; - } -} - /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. @@ -741,7 +351,7 @@ void fpu__init_system_xstate(void) init_xstate_size(); update_regset_xstate_info(xstate_size, xfeatures_mask); - prepare_fx_sw_frame(); + fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n", -- cgit v1.2.1 From 0c306bcfba288296dc34d00d514546915234bc90 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 12:59:30 +0200 Subject: x86/fpu: Factor out the FPU regset code into fpu/regset.c So much of fpu/core.c is the regset code, but it just obscures the generic FPU state machine logic. Factor out the regset code into fpu/regset.c, where it can be read in isolation. This affects one API: fpu__activate_stopped() has to be made available from the core to fpu/regset.c. No change in functionality. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 4 +- arch/x86/kernel/fpu/Makefile | 2 +- arch/x86/kernel/fpu/core.c | 352 +---------------------------------- arch/x86/kernel/fpu/regset.c | 356 ++++++++++++++++++++++++++++++++++++ 4 files changed, 360 insertions(+), 354 deletions(-) create mode 100644 arch/x86/kernel/fpu/regset.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index f23ea10d3a1f..db6c24ba6d3d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -29,8 +29,6 @@ extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); -extern void fpu__activate_curr(struct fpu *fpu); - extern void fpstate_init(union thread_xstate *state); #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct i387_soft_struct *soft); @@ -49,6 +47,8 @@ extern int fpu__exception_code(struct fpu *fpu, int trap_nr); /* * High level FPU state handling functions: */ +extern void fpu__activate_curr(struct fpu *fpu); +extern void fpu__activate_stopped(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(void); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile index 5c697ded57f2..68279efb811a 100644 --- a/arch/x86/kernel/fpu/Makefile +++ b/arch/x86/kernel/fpu/Makefile @@ -2,4 +2,4 @@ # Build rules for the FPU support code: # -obj-y += init.o bugs.o core.o signal.o xstate.o +obj-y += init.o bugs.o core.o regset.o signal.o xstate.o diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 2cb75bd40dc0..e22711d37db0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -320,7 +320,7 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr); * the read-only case, it's not strictly necessary for * read-only access to the context. */ -static void fpu__activate_stopped(struct fpu *child_fpu) +void fpu__activate_stopped(struct fpu *child_fpu) { WARN_ON_ONCE(child_fpu == ¤t->thread.fpu); @@ -425,356 +425,6 @@ void fpu__clear(struct fpu *fpu) } } -/* - * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, - * as the "regset->n" for the xstate regset will be updated based on the feature - * capabilites supported by the xsave. - */ -int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) -{ - struct fpu *target_fpu = &target->thread.fpu; - - return target_fpu->fpstate_active ? regset->n : 0; -} - -int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) -{ - struct fpu *target_fpu = &target->thread.fpu; - - return (cpu_has_fxsr && target_fpu->fpstate_active) ? regset->n : 0; -} - -int xfpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - struct fpu *fpu = &target->thread.fpu; - - if (!cpu_has_fxsr) - return -ENODEV; - - fpu__activate_stopped(fpu); - fpstate_sanitize_xstate(fpu); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state.fxsave, 0, -1); -} - -int xfpregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - struct fpu *fpu = &target->thread.fpu; - int ret; - - if (!cpu_has_fxsr) - return -ENODEV; - - fpu__activate_stopped(fpu); - fpstate_sanitize_xstate(fpu); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state.fxsave, 0, -1); - - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; - - /* - * update the header bits in the xsave header, indicating the - * presence of FP and SSE state. - */ - if (cpu_has_xsave) - fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE; - - return ret; -} - -int xstateregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - struct fpu *fpu = &target->thread.fpu; - struct xsave_struct *xsave; - int ret; - - if (!cpu_has_xsave) - return -ENODEV; - - fpu__activate_stopped(fpu); - - xsave = &fpu->state.xsave; - - /* - * Copy the 48bytes defined by the software first into the xstate - * memory layout in the thread struct, so that we can copy the entire - * xstateregs to the user using one user_regset_copyout(). - */ - memcpy(&xsave->i387.sw_reserved, - xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - /* - * Copy the xstate memory layout. - */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - return ret; -} - -int xstateregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - struct fpu *fpu = &target->thread.fpu; - struct xsave_struct *xsave; - int ret; - - if (!cpu_has_xsave) - return -ENODEV; - - fpu__activate_stopped(fpu); - - xsave = &fpu->state.xsave; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->header.xfeatures &= xfeatures_mask; - /* - * These bits must be zero. - */ - memset(&xsave->header.reserved, 0, 48); - - return ret; -} - -#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION - -/* - * FPU tag word conversions. - */ - -static inline unsigned short twd_i387_to_fxsr(unsigned short twd) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. */ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - - return tmp; -} - -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) -#define FP_EXP_TAG_VALID 0 -#define FP_EXP_TAG_ZERO 1 -#define FP_EXP_TAG_SPECIAL 2 -#define FP_EXP_TAG_EMPTY 3 - -static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) -{ - struct _fpxreg *st; - u32 tos = (fxsave->swd >> 11) & 7; - u32 twd = (unsigned long) fxsave->twd; - u32 tag; - u32 ret = 0xffff0000u; - int i; - - for (i = 0; i < 8; i++, twd >>= 1) { - if (twd & 0x1) { - st = FPREG_ADDR(fxsave, (i - tos) & 7); - - switch (st->exponent & 0x7fff) { - case 0x7fff: - tag = FP_EXP_TAG_SPECIAL; - break; - case 0x0000: - if (!st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3]) - tag = FP_EXP_TAG_ZERO; - else - tag = FP_EXP_TAG_SPECIAL; - break; - default: - if (st->significand[3] & 0x8000) - tag = FP_EXP_TAG_VALID; - else - tag = FP_EXP_TAG_SPECIAL; - break; - } - } else { - tag = FP_EXP_TAG_EMPTY; - } - ret |= tag << (2 * i); - } - return ret; -} - -/* - * FXSR floating point environment conversions. - */ - -void -convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) -{ - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state.fxsave; - struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; - struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; - int i; - - env->cwd = fxsave->cwd | 0xffff0000u; - env->swd = fxsave->swd | 0xffff0000u; - env->twd = twd_fxsr_to_i387(fxsave); - -#ifdef CONFIG_X86_64 - env->fip = fxsave->rip; - env->foo = fxsave->rdp; - /* - * should be actually ds/cs at fpu exception time, but - * that information is not available in 64bit mode. - */ - env->fcs = task_pt_regs(tsk)->cs; - if (tsk == current) { - savesegment(ds, env->fos); - } else { - env->fos = tsk->thread.ds; - } - env->fos |= 0xffff0000; -#else - env->fip = fxsave->fip; - env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); - env->foo = fxsave->foo; - env->fos = fxsave->fos; -#endif - - for (i = 0; i < 8; ++i) - memcpy(&to[i], &from[i], sizeof(to[0])); -} - -void convert_to_fxsr(struct task_struct *tsk, - const struct user_i387_ia32_struct *env) - -{ - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state.fxsave; - struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; - struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; - int i; - - fxsave->cwd = env->cwd; - fxsave->swd = env->swd; - fxsave->twd = twd_i387_to_fxsr(env->twd); - fxsave->fop = (u16) ((u32) env->fcs >> 16); -#ifdef CONFIG_X86_64 - fxsave->rip = env->fip; - fxsave->rdp = env->foo; - /* cs and ds ignored */ -#else - fxsave->fip = env->fip; - fxsave->fcs = (env->fcs & 0xffff); - fxsave->foo = env->foo; - fxsave->fos = env->fos; -#endif - - for (i = 0; i < 8; ++i) - memcpy(&to[i], &from[i], sizeof(from[0])); -} - -int fpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - struct fpu *fpu = &target->thread.fpu; - struct user_i387_ia32_struct env; - - fpu__activate_stopped(fpu); - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - - if (!cpu_has_fxsr) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state.fsave, 0, - -1); - - fpstate_sanitize_xstate(fpu); - - if (kbuf && pos == 0 && count == sizeof(env)) { - convert_from_fxsr(kbuf, target); - return 0; - } - - convert_from_fxsr(&env, target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); -} - -int fpregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - struct fpu *fpu = &target->thread.fpu; - struct user_i387_ia32_struct env; - int ret; - - fpu__activate_stopped(fpu); - fpstate_sanitize_xstate(fpu); - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - - if (!cpu_has_fxsr) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state.fsave, 0, - -1); - - if (pos > 0 || count < sizeof(env)) - convert_from_fxsr(&env, target); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); - if (!ret) - convert_to_fxsr(target, &env); - - /* - * update the header bit in the xsave header, indicating the - * presence of FP. - */ - if (cpu_has_xsave) - fpu->state.xsave.header.xfeatures |= XSTATE_FP; - return ret; -} - -/* - * FPU state for core dumps. - * This is only used for a.out dumps now. - * It is declared generically using elf_fpregset_t (which is - * struct user_i387_struct) but is in fact only used for 32-bit - * dumps, so on 64-bit it is really struct user_i387_ia32_struct. - */ -int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) -{ - struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; - int fpvalid; - - fpvalid = fpu->fpstate_active; - if (fpvalid) - fpvalid = !fpregs_get(tsk, NULL, - 0, sizeof(struct user_i387_ia32_struct), - ufpu, NULL); - - return fpvalid; -} -EXPORT_SYMBOL(dump_fpu); - -#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ - /* * x87 math exception handling: */ diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c new file mode 100644 index 000000000000..1f58a1c2a941 --- /dev/null +++ b/arch/x86/kernel/fpu/regset.c @@ -0,0 +1,356 @@ +/* + * FPU register's regset abstraction, for ptrace, core dumps, etc. + */ +#include +#include +#include + +/* + * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilites supported by the xsave. + */ +int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + struct fpu *target_fpu = &target->thread.fpu; + + return target_fpu->fpstate_active ? regset->n : 0; +} + +int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + struct fpu *target_fpu = &target->thread.fpu; + + return (cpu_has_fxsr && target_fpu->fpstate_active) ? regset->n : 0; +} + +int xfpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + + if (!cpu_has_fxsr) + return -ENODEV; + + fpu__activate_stopped(fpu); + fpstate_sanitize_xstate(fpu); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &fpu->state.fxsave, 0, -1); +} + +int xfpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + int ret; + + if (!cpu_has_fxsr) + return -ENODEV; + + fpu__activate_stopped(fpu); + fpstate_sanitize_xstate(fpu); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fpu->state.fxsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + + /* + * update the header bits in the xsave header, indicating the + * presence of FP and SSE state. + */ + if (cpu_has_xsave) + fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE; + + return ret; +} + +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xsave_struct *xsave; + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + fpu__activate_stopped(fpu); + + xsave = &fpu->state.xsave; + + /* + * Copy the 48bytes defined by the software first into the xstate + * memory layout in the thread struct, so that we can copy the entire + * xstateregs to the user using one user_regset_copyout(). + */ + memcpy(&xsave->i387.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + /* + * Copy the xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + return ret; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xsave_struct *xsave; + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + fpu__activate_stopped(fpu); + + xsave = &fpu->state.xsave; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + xsave->header.xfeatures &= xfeatures_mask; + /* + * These bits must be zero. + */ + memset(&xsave->header.reserved, 0, 48); + + return ret; +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + + return tmp; +} + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 + +static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) +{ + struct _fpxreg *st; + u32 tos = (fxsave->swd >> 11) & 7; + u32 twd = (unsigned long) fxsave->twd; + u32 tag; + u32 ret = 0xffff0000u; + int i; + + for (i = 0; i < 8; i++, twd >>= 1) { + if (twd & 0x1) { + st = FPREG_ADDR(fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = FP_EXP_TAG_SPECIAL; + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) + tag = FP_EXP_TAG_ZERO; + else + tag = FP_EXP_TAG_SPECIAL; + break; + default: + if (st->significand[3] & 0x8000) + tag = FP_EXP_TAG_VALID; + else + tag = FP_EXP_TAG_SPECIAL; + break; + } + } else { + tag = FP_EXP_TAG_EMPTY; + } + ret |= tag << (2 * i); + } + return ret; +} + +/* + * FXSR floating point environment conversions. + */ + +void +convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state.fxsave; + struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + env->cwd = fxsave->cwd | 0xffff0000u; + env->swd = fxsave->swd | 0xffff0000u; + env->twd = twd_fxsr_to_i387(fxsave); + +#ifdef CONFIG_X86_64 + env->fip = fxsave->rip; + env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + env->fcs = task_pt_regs(tsk)->cs; + if (tsk == current) { + savesegment(ds, env->fos); + } else { + env->fos = tsk->thread.ds; + } + env->fos |= 0xffff0000; +#else + env->fip = fxsave->fip; + env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); + env->foo = fxsave->foo; + env->fos = fxsave->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(to[0])); +} + +void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env) + +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state.fxsave; + struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + fxsave->cwd = env->cwd; + fxsave->swd = env->swd; + fxsave->twd = twd_i387_to_fxsr(env->twd); + fxsave->fop = (u16) ((u32) env->fcs >> 16); +#ifdef CONFIG_X86_64 + fxsave->rip = env->fip; + fxsave->rdp = env->foo; + /* cs and ds ignored */ +#else + fxsave->fip = env->fip; + fxsave->fcs = (env->fcs & 0xffff); + fxsave->foo = env->foo; + fxsave->fos = env->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(from[0])); +} + +int fpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct user_i387_ia32_struct env; + + fpu__activate_stopped(fpu); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + + if (!cpu_has_fxsr) + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &fpu->state.fsave, 0, + -1); + + fpstate_sanitize_xstate(fpu); + + if (kbuf && pos == 0 && count == sizeof(env)) { + convert_from_fxsr(kbuf, target); + return 0; + } + + convert_from_fxsr(&env, target); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); +} + +int fpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct user_i387_ia32_struct env; + int ret; + + fpu__activate_stopped(fpu); + fpstate_sanitize_xstate(fpu); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + + if (!cpu_has_fxsr) + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fpu->state.fsave, 0, + -1); + + if (pos > 0 || count < sizeof(env)) + convert_from_fxsr(&env, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (!ret) + convert_to_fxsr(target, &env); + + /* + * update the header bit in the xsave header, indicating the + * presence of FP. + */ + if (cpu_has_xsave) + fpu->state.xsave.header.xfeatures |= XSTATE_FP; + return ret; +} + +/* + * FPU state for core dumps. + * This is only used for a.out dumps now. + * It is declared generically using elf_fpregset_t (which is + * struct user_i387_struct) but is in fact only used for 32-bit + * dumps, so on 64-bit it is really struct user_i387_ia32_struct. + */ +int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) +{ + struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; + int fpvalid; + + fpvalid = fpu->fpstate_active; + if (fpvalid) + fpvalid = !fpregs_get(tsk, NULL, + 0, sizeof(struct user_i387_ia32_struct), + ufpu, NULL); + + return fpvalid; +} +EXPORT_SYMBOL(dump_fpu); + +#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ -- cgit v1.2.1 From c47ada305de3803517ae64aa50686f644c5456fa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2015 17:15:32 +0200 Subject: x86/fpu: Harmonize FPU register state types Use these consistent names: struct fregs_state # was: i387_fsave_struct struct fxregs_state # was: i387_fxsave_struct struct swregs_state # was: i387_soft_struct struct xregs_state # was: xsave_struct union fpregs_state # was: thread_xstate Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 22 +++++++++++----------- arch/x86/include/asm/fpu/types.h | 24 ++++++++++++------------ arch/x86/include/asm/fpu/xstate.h | 16 ++++++++-------- arch/x86/include/asm/mpx.h | 8 ++++---- arch/x86/kernel/fpu/core.c | 6 +++--- arch/x86/kernel/fpu/init.c | 8 ++++---- arch/x86/kernel/fpu/regset.c | 10 +++++----- arch/x86/kernel/fpu/signal.c | 32 ++++++++++++++++---------------- arch/x86/kernel/fpu/xstate.c | 6 +++--- arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/x86.c | 12 ++++++------ arch/x86/math-emu/fpu_aux.c | 2 +- arch/x86/math-emu/fpu_entry.c | 10 +++++----- arch/x86/mm/mpx.c | 6 +++--- 14 files changed, 82 insertions(+), 82 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index db6c24ba6d3d..7fdc90b9dd86 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -22,20 +22,20 @@ extern unsigned int mxcsr_feature_mask; -extern union thread_xstate init_fpstate; +extern union fpregs_state init_fpstate; extern void fpu__init_cpu(void); extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); -extern void fpstate_init(union thread_xstate *state); +extern void fpstate_init(union fpregs_state *state); #ifdef CONFIG_MATH_EMULATION -extern void fpstate_init_soft(struct i387_soft_struct *soft); +extern void fpstate_init_soft(struct swregs_state *soft); #else -static inline void fpstate_init_soft(struct i387_soft_struct *soft) {} +static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif -static inline void fpstate_init_fxstate(struct i387_fxsave_struct *fx) +static inline void fpstate_init_fxstate(struct fxregs_state *fx) { fx->cwd = 0x37f; fx->mxcsr = MXCSR_DEFAULT; @@ -133,12 +133,12 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); err; \ }) -static inline int copy_fregs_to_user(struct i387_fsave_struct __user *fx) +static inline int copy_fregs_to_user(struct fregs_state __user *fx) { return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } -static inline int copy_fxregs_to_user(struct i387_fxsave_struct __user *fx) +static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) { if (config_enabled(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); @@ -149,7 +149,7 @@ static inline int copy_fxregs_to_user(struct i387_fxsave_struct __user *fx) return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); } -static inline int copy_kernel_to_fxregs(struct i387_fxsave_struct *fx) +static inline int copy_kernel_to_fxregs(struct fxregs_state *fx) { if (config_enabled(CONFIG_X86_32)) return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); @@ -161,7 +161,7 @@ static inline int copy_kernel_to_fxregs(struct i387_fxsave_struct *fx) "m" (*fx)); } -static inline int copy_user_to_fxregs(struct i387_fxsave_struct __user *fx) +static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) { if (config_enabled(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); @@ -173,12 +173,12 @@ static inline int copy_user_to_fxregs(struct i387_fxsave_struct __user *fx) "m" (*fx)); } -static inline int copy_kernel_to_fregs(struct i387_fsave_struct *fx) +static inline int copy_kernel_to_fregs(struct fregs_state *fx) { return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline int copy_user_to_fregs(struct i387_fsave_struct __user *fx) +static inline int copy_user_to_fregs(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 006ec2975f6f..fe2ce3276a38 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -4,7 +4,7 @@ #ifndef _ASM_X86_FPU_H #define _ASM_X86_FPU_H -struct i387_fsave_struct { +struct fregs_state { u32 cwd; /* FPU Control Word */ u32 swd; /* FPU Status Word */ u32 twd; /* FPU Tag Word */ @@ -20,7 +20,7 @@ struct i387_fsave_struct { u32 status; }; -struct i387_fxsave_struct { +struct fxregs_state { u16 cwd; /* Control Word */ u16 swd; /* Status Word */ u16 twd; /* Tag Word */ @@ -58,7 +58,7 @@ struct i387_fxsave_struct { /* * Software based FPU emulation state: */ -struct i387_soft_struct { +struct swregs_state { u32 cwd; u32 swd; u32 twd; @@ -109,7 +109,7 @@ enum xfeature_bit { /* * There are 16x 256-bit AVX registers named YMM0-YMM15. * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) - * and are stored in 'struct i387_fxsave_struct::xmm_space[]'. + * and are stored in 'struct fxregs_state::xmm_space[]'. * * The high 128 bits are stored here: * 16x 128 bits == 256 bytes. @@ -140,8 +140,8 @@ struct xstate_header { u64 reserved[6]; } __attribute__((packed)); -struct xsave_struct { - struct i387_fxsave_struct i387; +struct xregs_state { + struct fxregs_state i387; struct xstate_header header; struct ymmh_struct ymmh; struct lwp_struct lwp; @@ -150,11 +150,11 @@ struct xsave_struct { /* New processor state extensions will go here. */ } __attribute__ ((packed, aligned (64))); -union thread_xstate { - struct i387_fsave_struct fsave; - struct i387_fxsave_struct fxsave; - struct i387_soft_struct soft; - struct xsave_struct xsave; +union fpregs_state { + struct fregs_state fsave; + struct fxregs_state fxsave; + struct swregs_state soft; + struct xregs_state xsave; }; struct fpu { @@ -171,7 +171,7 @@ struct fpu { unsigned int last_cpu; unsigned int fpregs_active; - union thread_xstate state; + union fpregs_state state; /* * This counter contains the number of consecutive context switches * during which the FPU stays used. If this is over a threshold, the diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 7f59480697a3..a6181b9ebf42 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -58,7 +58,7 @@ extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline int copy_xregs_to_kernel_booting(struct xsave_struct *fx) +static inline int copy_xregs_to_kernel_booting(struct xregs_state *fx) { u64 mask = -1; u32 lmask = mask; @@ -86,7 +86,7 @@ static inline int copy_xregs_to_kernel_booting(struct xsave_struct *fx) * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline int copy_kernel_to_xregs_booting(struct xsave_struct *fx, u64 mask) +static inline int copy_kernel_to_xregs_booting(struct xregs_state *fx, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; @@ -112,7 +112,7 @@ static inline int copy_kernel_to_xregs_booting(struct xsave_struct *fx, u64 mask /* * Save processor xstate to xsave area. */ -static inline int copy_xregs_to_kernel(struct xsave_struct *fx) +static inline int copy_xregs_to_kernel(struct xregs_state *fx) { u64 mask = -1; u32 lmask = mask; @@ -151,7 +151,7 @@ static inline int copy_xregs_to_kernel(struct xsave_struct *fx) /* * Restore processor xstate from xsave area. */ -static inline int copy_kernel_to_xregs(struct xsave_struct *fx, u64 mask) +static inline int copy_kernel_to_xregs(struct xregs_state *fx, u64 mask) { int err = 0; u32 lmask = mask; @@ -186,7 +186,7 @@ static inline int copy_kernel_to_xregs(struct xsave_struct *fx, u64 mask) * backward compatibility for old applications which don't understand * compacted format of xsave area. */ -static inline int copy_xregs_to_user(struct xsave_struct __user *buf) +static inline int copy_xregs_to_user(struct xregs_state __user *buf) { int err; @@ -210,10 +210,10 @@ static inline int copy_xregs_to_user(struct xsave_struct __user *buf) /* * Restore xstate from user space xsave area. */ -static inline int copy_user_to_xregs(struct xsave_struct __user *buf, u64 mask) +static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) { int err = 0; - struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); + struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; u32 hmask = mask >> 32; @@ -226,7 +226,7 @@ static inline int copy_user_to_xregs(struct xsave_struct __user *buf, u64 mask) return err; } -void *get_xsave_addr(struct xsave_struct *xsave, int xstate); +void *get_xsave_addr(struct xregs_state *xsave, int xstate); void setup_xstate_comp(void); #endif diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index a952a13d59a7..f3c1b71d4fae 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -60,8 +60,8 @@ #ifdef CONFIG_X86_INTEL_MPX siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xsave_struct *xsave_buf); -int mpx_handle_bd_fault(struct xsave_struct *xsave_buf); + struct xregs_state *xsave_buf); +int mpx_handle_bd_fault(struct xregs_state *xsave_buf); static inline int kernel_managing_mpx_tables(struct mm_struct *mm) { return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); @@ -78,11 +78,11 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); #else static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xsave_struct *xsave_buf) + struct xregs_state *xsave_buf) { return NULL; } -static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +static inline int mpx_handle_bd_fault(struct xregs_state *xsave_buf) { return -EINVAL; } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e22711d37db0..ac39616cb021 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -16,7 +16,7 @@ * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ -union thread_xstate init_fpstate __read_mostly; +union fpregs_state init_fpstate __read_mostly; /* * Track whether the kernel is using the FPU state @@ -200,7 +200,7 @@ EXPORT_SYMBOL_GPL(fpu__save); /* * Legacy x87 fpstate state init: */ -static inline void fpstate_init_fstate(struct i387_fsave_struct *fp) +static inline void fpstate_init_fstate(struct fregs_state *fp) { fp->cwd = 0xffff037fu; fp->swd = 0xffff0000u; @@ -208,7 +208,7 @@ static inline void fpstate_init_fstate(struct i387_fsave_struct *fp) fp->fos = 0xffff0000u; } -void fpstate_init(union thread_xstate *state) +void fpstate_init(union fpregs_state *state) { if (!cpu_has_fpu) { fpstate_init_soft(&state->soft); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 93bc11a5812c..504370662899 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -95,7 +95,7 @@ static void fpu__init_system_mxcsr(void) unsigned int mask = 0; if (cpu_has_fxsr) { - struct i387_fxsave_struct fx_tmp __aligned(32) = { }; + struct fxregs_state fx_tmp __aligned(32) = { }; asm volatile("fxsave %0" : "+m" (fx_tmp)); @@ -155,12 +155,12 @@ static void fpu__init_system_xstate_size_legacy(void) */ setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - xstate_size = sizeof(struct i387_soft_struct); + xstate_size = sizeof(struct swregs_state); } else { if (cpu_has_fxsr) - xstate_size = sizeof(struct i387_fxsave_struct); + xstate_size = sizeof(struct fxregs_state); else - xstate_size = sizeof(struct i387_fsave_struct); + xstate_size = sizeof(struct fregs_state); } } diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 1f58a1c2a941..297b3da8e4c4 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -76,7 +76,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, void *kbuf, void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; - struct xsave_struct *xsave; + struct xregs_state *xsave; int ret; if (!cpu_has_xsave) @@ -105,7 +105,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; - struct xsave_struct *xsave; + struct xregs_state *xsave; int ret; if (!cpu_has_xsave) @@ -156,7 +156,7 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd) #define FP_EXP_TAG_SPECIAL 2 #define FP_EXP_TAG_EMPTY 3 -static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) +static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave) { struct _fpxreg *st; u32 tos = (fxsave->swd >> 11) & 7; @@ -204,7 +204,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state.fxsave; + struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -242,7 +242,7 @@ void convert_to_fxsr(struct task_struct *tsk, const struct user_i387_ia32_struct *env) { - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state.fxsave; + struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; int i; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 8d0c26ab5123..99f73093333d 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -17,11 +17,11 @@ static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ -static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, +static inline int check_for_xstate(struct fxregs_state __user *buf, void __user *fpstate, struct _fpx_sw_bytes *fx_sw) { - int min_xstate_size = sizeof(struct i387_fxsave_struct) + + int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); unsigned int magic2; @@ -54,7 +54,7 @@ static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; struct user_i387_ia32_struct env; struct _fpstate_ia32 __user *fp = buf; @@ -65,7 +65,7 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) __put_user(X86_FXSR_MAGIC, &fp->magic)) return -1; } else { - struct i387_fsave_struct __user *fp = buf; + struct fregs_state __user *fp = buf; u32 swd; if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) return -1; @@ -76,7 +76,7 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) static inline int save_xstate_epilog(void __user *buf, int ia32_frame) { - struct xsave_struct __user *x = buf; + struct xregs_state __user *x = buf; struct _fpx_sw_bytes *sw_bytes; u32 xfeatures; int err; @@ -114,16 +114,16 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) return err; } -static inline int copy_fpregs_to_sigframe(struct xsave_struct __user *buf) +static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) { int err; if (use_xsave()) err = copy_xregs_to_user(buf); else if (use_fxsr()) - err = copy_fxregs_to_user((struct i387_fxsave_struct __user *) buf); + err = copy_fxregs_to_user((struct fxregs_state __user *) buf); else - err = copy_fregs_to_user((struct i387_fsave_struct __user *) buf); + err = copy_fregs_to_user((struct fregs_state __user *) buf); if (unlikely(err) && __clear_user(buf, xstate_size)) err = -EFAULT; @@ -152,7 +152,7 @@ static inline int copy_fpregs_to_sigframe(struct xsave_struct __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { - struct xsave_struct *xsave = ¤t->thread.fpu.state.xsave; + struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -195,7 +195,7 @@ sanitize_restored_xstate(struct task_struct *tsk, struct user_i387_ia32_struct *ia32_env, u64 xfeatures, int fx_only) { - struct xsave_struct *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; struct xstate_header *header = &xsave->header; if (use_xsave()) { @@ -280,7 +280,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * memory layout. Restore just the FP/SSE and init all * the other extended state. */ - state_size = sizeof(struct i387_fxsave_struct); + state_size = sizeof(struct fxregs_state); fx_only = 1; } else { state_size = fx_sw_user.xstate_size; @@ -353,8 +353,8 @@ int fpu__restore_sig(void __user *buf, int ia32_frame) int size = xstate_sigframe_size(); if (ia32_frame && use_fxsr()) { - buf_fx = buf + sizeof(struct i387_fsave_struct); - size += sizeof(struct i387_fsave_struct); + buf_fx = buf + sizeof(struct fregs_state); + size += sizeof(struct fregs_state); } return __fpu__restore_sig(buf, buf_fx, size); @@ -368,8 +368,8 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { - frame_size += sizeof(struct i387_fsave_struct); - sp -= sizeof(struct i387_fsave_struct); + frame_size += sizeof(struct fregs_state); + sp -= sizeof(struct fregs_state); } *size = frame_size; @@ -385,7 +385,7 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, */ void fpu__init_prepare_fx_sw_frame(void) { - int fsave_header_size = sizeof(struct i387_fsave_struct); + int fsave_header_size = sizeof(struct fregs_state); int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; if (config_enabled(CONFIG_X86_32)) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 3629e2ef3c94..733a8aec7bd7 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -91,7 +91,7 @@ EXPORT_SYMBOL_GPL(cpu_has_xfeatures); */ void fpstate_sanitize_xstate(struct fpu *fpu) { - struct i387_fxsave_struct *fx = &fpu->state.fxsave; + struct fxregs_state *fx = &fpu->state.fxsave; int feature_bit; u64 xfeatures; @@ -231,7 +231,7 @@ void setup_xstate_comp(void) * or standard form. */ xstate_comp_offsets[0] = 0; - xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space); + xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); if (!cpu_has_xsaves) { for (i = 2; i < xfeatures_nr; i++) { @@ -386,7 +386,7 @@ void fpu__resume_cpu(void) * Output: * address of the state in the xsave area. */ -void *get_xsave_addr(struct xsave_struct *xsave, int xstate) +void *get_xsave_addr(struct xregs_state *xsave, int xstate) { int feature = fls64(xstate) - 1; if (!test_bit(feature, (unsigned long *)&xfeatures_mask)) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index cab397d0085f..6f581c65c648 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -371,7 +371,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - struct xsave_struct *xsave_buf; + struct xregs_state *xsave_buf; enum ctx_state prev_state; struct bndcsr *bndcsr; siginfo_t *info; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d811bb2728f..e14a7a65e975 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3195,7 +3195,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { - struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state.xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; u64 xstate_bv = xsave->header.xfeatures; u64 valid; @@ -3231,7 +3231,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) { - struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state.xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); u64 valid; @@ -3277,7 +3277,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, } else { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state.fxsave, - sizeof(struct i387_fxsave_struct)); + sizeof(struct fxregs_state)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = XSTATE_FPSSE; } @@ -3302,7 +3302,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, if (xstate_bv & ~XSTATE_FPSSE) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state.fxsave, - guest_xsave->region, sizeof(struct i387_fxsave_struct)); + guest_xsave->region, sizeof(struct fxregs_state)); } return 0; } @@ -6970,7 +6970,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct i387_fxsave_struct *fxsave = + struct fxregs_state *fxsave = &vcpu->arch.guest_fpu.state.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); @@ -6987,7 +6987,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct i387_fxsave_struct *fxsave = + struct fxregs_state *fxsave = &vcpu->arch.guest_fpu.state.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 768b2b8271d6..dd76a05729b0 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -30,7 +30,7 @@ static void fclex(void) } /* Needs to be externally visible */ -void fpstate_init_soft(struct i387_soft_struct *soft) +void fpstate_init_soft(struct swregs_state *soft) { struct address *oaddr, *iaddr; memset(soft, 0, sizeof(*soft)); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 5b850514eb68..f37e84ab49f3 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -669,7 +669,7 @@ void math_abort(struct math_emu_info *info, unsigned int signal) #endif /* PARANOID */ } -#define S387 ((struct i387_soft_struct *)s387) +#define S387 ((struct swregs_state *)s387) #define sstatus_word() \ ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top)) @@ -678,14 +678,14 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct i387_soft_struct *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.state.soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; RE_ENTRANT_CHECK_OFF; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0, - offsetof(struct i387_soft_struct, st_space)); + offsetof(struct swregs_state, st_space)); RE_ENTRANT_CHECK_ON; if (ret) @@ -730,7 +730,7 @@ int fpregs_soft_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - struct i387_soft_struct *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.state.soft; const void *space = s387->st_space; int ret; int offset = (S387->ftop & 7) * 10, other = 80 - offset; @@ -748,7 +748,7 @@ int fpregs_soft_get(struct task_struct *target, #endif /* PECULIAR_486 */ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0, - offsetof(struct i387_soft_struct, st_space)); + offsetof(struct swregs_state, st_space)); /* Copy all registers in stack order. */ if (!ret) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 5e20bacee210..2e0dfd39bd22 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -272,7 +272,7 @@ bad_opcode: * The caller is expected to kfree() the returned siginfo_t. */ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xsave_struct *xsave_buf) + struct xregs_state *xsave_buf) { struct bndreg *bndregs, *bndreg; siginfo_t *info = NULL; @@ -497,7 +497,7 @@ out_unmap: * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, * and the size of each bound table is 4MB. */ -static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) +static int do_mpx_bt_fault(struct xregs_state *xsave_buf) { unsigned long bd_entry, bd_base; struct bndcsr *bndcsr; @@ -525,7 +525,7 @@ static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) return allocate_bt((long __user *)bd_entry); } -int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +int mpx_handle_bd_fault(struct xregs_state *xsave_buf) { /* * Userspace never asked us to manage the bounds tables, -- cgit v1.2.1 From aeb997b9f2a2199c72b89b7a304cafc394e4202b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 1 May 2015 09:59:04 +0200 Subject: x86/fpu: Change fpu->fpregs_active from 'int' to 'char', add lazy switching comments Improve the memory layout of 'struct fpu': - change ->fpregs_active from 'int' to 'char' - it's just a single flag and modern x86 CPUs can do efficient byte accesses. - pack related fields closer to each other: often 'fpu->state' will not be touched, while the other fields will - so pack them into a group. Also add comments to each field, describing their purpose, and add some background information about lazy restores. Also fix an obsolete, lazy switching related comment in fpu_copy()'s description. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 82 +++++++++++++++++++++++++++++++++++----- arch/x86/kernel/fpu/core.c | 6 +-- arch/x86/kernel/fpu/xstate.c | 9 +++-- 3 files changed, 79 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index fe2ce3276a38..261cfb76065f 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -159,8 +159,44 @@ union fpregs_state { struct fpu { /* + * @state: + * + * In-memory copy of all FPU registers that we save/restore + * over context switches. If the task is using the FPU then + * the registers in the FPU are more recent than this state + * copy. If the task context-switches away then they get + * saved here and represent the FPU state. + * + * After context switches there may be a (short) time period + * during which the in-FPU hardware registers are unchanged + * and still perfectly match this state, if the tasks + * scheduled afterwards are not using the FPU. + * + * This is the 'lazy restore' window of optimization, which + * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. + * + * We detect whether a subsequent task uses the FPU via setting + * CR0::TS to 1, which causes any FPU use to raise a #NM fault. + * + * During this window, if the task gets scheduled again, we + * might be able to skip having to do a restore from this + * memory buffer to the hardware registers - at the cost of + * incurring the overhead of #NM fault traps. + * + * Note that on modern CPUs that support the XSAVEOPT (or other + * optimized XSAVE instructions), we don't use #NM traps anymore, + * as the hardware can track whether FPU registers need saving + * or not. On such CPUs we activate the non-lazy ('eagerfpu') + * logic, which unconditionally saves/restores all FPU state + * across context switches. (if FPU state exists.) + */ + union fpregs_state state; + + /* + * @last_cpu: + * * Records the last CPU on which this context was loaded into - * FPU registers. (In the lazy-switching case we might be + * FPU registers. (In the lazy-restore case we might be * able to reuse FPU registers across multiple context switches * this way, if no intermediate task used the FPU.) * @@ -170,23 +206,49 @@ struct fpu { */ unsigned int last_cpu; - unsigned int fpregs_active; - union fpregs_state state; /* + * @fpstate_active: + * + * This flag indicates whether this context is active: if the task + * is not running then we can restore from this context, if the task + * is running then we should save into this context. + */ + unsigned char fpstate_active; + + /* + * @fpregs_active: + * + * This flag determines whether a given context is actively + * loaded into the FPU's registers and that those registers + * represent the task's current FPU state. + * + * Note the interaction with fpstate_active: + * + * # task does not use the FPU: + * fpstate_active == 0 + * + * # task uses the FPU and regs are active: + * fpstate_active == 1 && fpregs_active == 1 + * + * # the regs are inactive but still match fpstate: + * fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu + * + * The third state is what we use for the lazy restore optimization + * on lazy-switching CPUs. + */ + unsigned char fpregs_active; + + /* + * @counter: + * * This counter contains the number of consecutive context switches * during which the FPU stays used. If this is over a threshold, the - * lazy fpu saving logic becomes unlazy, to save the trap overhead. + * lazy FPU restore logic becomes eager, to save the trap overhead. * This is an unsigned char so that after 256 iterations the counter * wraps and the context switch behavior turns lazy again; this is to * deal with bursty apps that only use the FPU for a short time: */ unsigned char counter; - /* - * This flag indicates whether this context is fpstate_active: if the task is - * not running then we can restore from this context, if the task - * is running then we should save into this context. - */ - unsigned char fpstate_active; }; #endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ac39616cb021..97df457784aa 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -227,10 +227,8 @@ EXPORT_SYMBOL_GPL(fpstate_init); /* * Copy the current task's FPU state to a new task's FPU context. * - * In the 'eager' case we just save to the destination context. - * - * In the 'lazy' case we save to the source context, mark the FPU lazy - * via stts() and copy the source context into the destination context. + * In both the 'eager' and the 'lazy' case we save hardware registers + * directly to the destination buffer. */ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 733a8aec7bd7..cd7f1a6bd933 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -76,10 +76,11 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) EXPORT_SYMBOL_GPL(cpu_has_xfeatures); /* - * When executing XSAVEOPT (optimized XSAVE), if a processor implementation - * detects that an FPU state component is still (or is again) in its - * initialized state, it may clear the corresponding bit in the header.xfeatures - * field, and can skip the writeout of registers to the corresponding memory layout. + * When executing XSAVEOPT (or other optimized XSAVE instructions), if + * a processor implementation detects that an FPU state component is still + * (or is again) in its initialized state, it may clear the corresponding + * bit in the header.xfeatures field, and can skip the writeout of registers + * to the corresponding memory layout. * * This means that when the bit is zero, the state component might still contain * some previous - non-initialized register state. -- cgit v1.2.1 From bdf80d104021d3405e5ec8ecc08f189b74ae022a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 2 May 2015 10:22:45 +0200 Subject: x86/fpu: Document the various fpregs state formats Document all the structures that make up 'struct fpu'. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 261cfb76065f..4c4eceb08a42 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -4,6 +4,10 @@ #ifndef _ASM_X86_FPU_H #define _ASM_X86_FPU_H +/* + * The legacy x87 FPU state format, as saved by FSAVE and + * restored by the FRSTOR instructions: + */ struct fregs_state { u32 cwd; /* FPU Control Word */ u32 swd; /* FPU Status Word */ @@ -16,10 +20,16 @@ struct fregs_state { /* 8*10 bytes for each FP-reg = 80 bytes: */ u32 st_space[20]; - /* Software status information [not touched by FSAVE ]: */ + /* Software status information [not touched by FSAVE]: */ u32 status; }; +/* + * The legacy fx SSE/MMX FPU state format, as saved by FXSAVE and + * restored by the FXRSTOR instructions. It's similar to the FSAVE + * format, but differs in some areas, plus has extensions at + * the end for the XMM registers. + */ struct fxregs_state { u16 cwd; /* Control Word */ u16 swd; /* Status Word */ @@ -56,7 +66,8 @@ struct fxregs_state { } __attribute__((aligned(16))); /* - * Software based FPU emulation state: + * Software based FPU emulation state. This is arbitrary really, + * it matches the x87 format to make it easier to understand: */ struct swregs_state { u32 cwd; @@ -140,6 +151,14 @@ struct xstate_header { u64 reserved[6]; } __attribute__((packed)); +/* + * This is our most modern FPU state format, as saved by the XSAVE + * and restored by the XRSTOR instructions. + * + * It consists of a legacy fxregs portion, an xstate header and + * subsequent fixed size areas as defined by the xstate header. + * Not all CPUs support all the extensions. + */ struct xregs_state { struct fxregs_state i387; struct xstate_header header; @@ -150,6 +169,13 @@ struct xregs_state { /* New processor state extensions will go here. */ } __attribute__ ((packed, aligned (64))); +/* + * This is a union of all the possible FPU state formats + * put together, so that we can pick the right one runtime. + * + * The size of the structure is determined by the largest + * member - which is the xsave area: + */ union fpregs_state { struct fregs_state fsave; struct fxregs_state fxsave; @@ -157,6 +183,11 @@ union fpregs_state { struct xregs_state xsave; }; +/* + * Highest level per task FPU state data structure that + * contains the FPU register state plus various FPU + * state fields: + */ struct fpu { /* * @state: -- cgit v1.2.1 From 63c6680cd09b8803f428ec606d764229cb78eca6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 1 May 2015 10:54:22 +0200 Subject: x86/fpu: Move debugging check from kernel_fpu_begin() to __kernel_fpu_begin() kernel_fpu_begin() is __kernel_fpu_begin() with a preempt_disable(). Move the kernel_fpu_begin() debugging check into __kernel_fpu_begin(), so that users of __kernel_fpu_begin() may benefit from it as well. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 97df457784aa..d67558cdbddd 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -109,6 +109,8 @@ void __kernel_fpu_begin(void) { struct fpu *fpu = ¤t->thread.fpu; + WARN_ON_ONCE(!irq_fpu_usable()); + kernel_fpu_disable(); if (fpu->fpregs_active) { @@ -138,7 +140,6 @@ EXPORT_SYMBOL(__kernel_fpu_end); void kernel_fpu_begin(void) { preempt_disable(); - WARN_ON_ONCE(!irq_fpu_usable()); __kernel_fpu_begin(); } EXPORT_SYMBOL_GPL(kernel_fpu_begin); -- cgit v1.2.1 From 39f1acd243725b4b20caa37a81b6c8f641b505b8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2015 07:37:47 +0200 Subject: x86/fpu/xstate: Don't assume the first zero xfeatures zero bit means the end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current xstate code in setup_xstate_features() assumes that the first zero bit means the end of xfeatures - but that is not so, the SDM clearly states that an arbitrary set of xfeatures might be enabled - and it is also clear from the description of the compaction feature that holes are possible: "13-6 Vol. 1MANAGING STATE USING THE XSAVE FEATURE SET [...] Compacted format. Each state component i (i ≥ 2) is located at a byte offset from the base address of the XSAVE area based on the XCOMP_BV field in the XSAVE header: — If XCOMP_BV[i] = 0, state component i is not in the XSAVE area. — If XCOMP_BV[i] = 1, the following items apply: • If XCOMP_BV[j] = 0 for every j, 2 ≤ j < i, state component i is located at a byte offset 576 from the base address of the XSAVE area. (This item applies if i is the first bit set in bits 62:2 of the XCOMP_BV; it implies that state component i is located at the beginning of the extended region.) • Otherwise, let j, 2 ≤ j < i, be the greatest value such that XCOMP_BV[j] = 1. Then state component i is located at a byte offset X from the location of state component j, where X is the number of bytes required for state component j as enumerated in CPUID.(EAX=0DH,ECX=j):EAX. (This item implies that state component i immediately follows the preceding state component whose bit is set in XCOMP_BV.)" So don't assume that the first zero xfeatures bit means the end of all xfeatures - iterate through all of them. I'm not aware of hardware that triggers this currently. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index cd7f1a6bd933..a024fa591a93 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -168,26 +168,27 @@ void fpu__init_cpu_xstate(void) } /* - * Record the offsets and sizes of different state managed by the xsave - * memory layout. + * Record the offsets and sizes of various xstates contained + * in the XSAVE state memory layout. + * + * ( Note that certain features might be non-present, for them + * we'll have 0 offset and 0 size. ) */ static void __init setup_xstate_features(void) { - int eax, ebx, ecx, edx, leaf = 0x2; + u32 eax, ebx, ecx, edx, leaf; xfeatures_nr = fls64(xfeatures_mask); - do { + for (leaf = 2; leaf < xfeatures_nr; leaf++) { cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); - if (eax == 0) - break; - xstate_offsets[leaf] = ebx; xstate_sizes[leaf] = eax; + printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %04x, xstate_sizes[%d]: %04x\n", leaf, ebx, leaf, eax); leaf++; - } while (1); + } } static void print_xstate_feature(u64 xstate_mask) -- cgit v1.2.1 From 489e9c018854158722eb7123f9efefe67af5c4bd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2015 09:04:56 +0200 Subject: x86/fpu: Clean up xstate feature reservation Put MPX support into its separate high level structure, and also replace the fixed YMM, LWP and MPX structures in xregs_state with just reservations - their exact offsets in the structure will depend on the CPU and no code actually relies on those fields. No change in functionality. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 4c4eceb08a42..02241c2a10e9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -145,12 +145,21 @@ struct bndcsr { u64 bndstatus; } __packed; +struct mpx_struct { + struct bndreg bndreg[4]; + struct bndcsr bndcsr; +}; + struct xstate_header { u64 xfeatures; u64 xcomp_bv; u64 reserved[6]; } __attribute__((packed)); +/* New processor state extensions should be added here: */ +#define XSTATE_RESERVE (sizeof(struct ymmh_struct) + \ + sizeof(struct lwp_struct) + \ + sizeof(struct mpx_struct) ) /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. @@ -162,11 +171,7 @@ struct xstate_header { struct xregs_state { struct fxregs_state i387; struct xstate_header header; - struct ymmh_struct ymmh; - struct lwp_struct lwp; - struct bndreg bndreg[4]; - struct bndcsr bndcsr; - /* New processor state extensions will go here. */ + u8 __reserved[XSTATE_RESERVE]; } __attribute__ ((packed, aligned (64))); /* -- cgit v1.2.1 From 5fd402dfa7fc97f8e8d74c92d24abadbdc4002ca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2015 09:43:55 +0200 Subject: x86/fpu/xstate: Clean up setup_xstate_comp() call So call setup_xstate_comp() from the xstate init code, not from the generic fpu__init_system() code. This allows us to remove the protytype from xstate.h as well. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 1 - arch/x86/kernel/fpu/init.c | 1 - arch/x86/kernel/fpu/xstate.c | 6 ++---- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index a6181b9ebf42..8f336d2ae126 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -227,6 +227,5 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) } void *get_xsave_addr(struct xregs_state *xsave, int xstate); -void setup_xstate_comp(void); #endif diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 504370662899..889025217407 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -262,7 +262,6 @@ void fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); - setup_xstate_comp(); fpu__init_system_ctx_switch(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a024fa591a93..9e77332f00e4 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -218,11 +218,8 @@ static void print_xstate_features(void) * This function sets up offsets and sizes of all extended states in * xsave area. This supports both standard format and compacted format * of the xsave aread. - * - * Input: void - * Output: void */ -void setup_xstate_comp(void) +static void setup_xstate_comp(void) { unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; int i; @@ -355,6 +352,7 @@ void fpu__init_system_xstate(void) update_regset_xstate_info(xstate_size, xfeatures_mask); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); + setup_xstate_comp(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n", xfeatures_mask, -- cgit v1.2.1 From 32231879f66162352fc6f3041c5c2b1d965879b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2015 09:52:42 +0200 Subject: x86/fpu/init: Propagate __init annotations Now that all the FPU init function call dependencies are cleaned up we can propagate __init annotations deeper. This shrinks the runtime size of the kernel a bit, and also addresses a few section warnings. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/bugs.c | 2 +- arch/x86/kernel/fpu/init.c | 12 ++++++------ arch/x86/kernel/fpu/xstate.c | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index 449b5f3f4925..dd9ca9b60ff3 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -60,7 +60,7 @@ static void __init check_fpu(void) } } -void fpu__init_check_bugs(void) +void __init fpu__init_check_bugs(void) { /* * kernel_fpu_begin/end() in check_fpu() relies on the patched diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 889025217407..a9e506a99a83 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -90,7 +90,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) */ unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; -static void fpu__init_system_mxcsr(void) +static void __init fpu__init_system_mxcsr(void) { unsigned int mask = 0; @@ -115,7 +115,7 @@ static void fpu__init_system_mxcsr(void) /* * Once per bootup FPU initialization sequences that will run on most x86 CPUs: */ -static void fpu__init_system_generic(void) +static void __init fpu__init_system_generic(void) { /* * Set up the legacy init FPU context. (xstate init might overwrite this @@ -141,7 +141,7 @@ EXPORT_SYMBOL_GPL(xstate_size); * We set this up first, and later it will be overwritten by * fpu__init_system_xstate() if the CPU knows about xstates. */ -static void fpu__init_system_xstate_size_legacy(void) +static void __init fpu__init_system_xstate_size_legacy(void) { /* * Note that xstate_size might be overwriten later during @@ -212,7 +212,7 @@ __setup("eagerfpu=", eager_fpu_setup); /* * Pick the FPU context switching strategy: */ -static void fpu__init_system_ctx_switch(void) +static void __init fpu__init_system_ctx_switch(void) { WARN_ON(current->thread.fpu.fpstate_active); current_thread_info()->status = 0; @@ -234,14 +234,14 @@ static void fpu__init_system_ctx_switch(void) if (eagerfpu == ENABLE) setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); - printk_once(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); + printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); } /* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: */ -void fpu__init_system(struct cpuinfo_x86 *c) +void __init fpu__init_system(struct cpuinfo_x86 *c) { fpu__init_system_early_generic(c); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9e77332f00e4..201f08feb259 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -191,7 +191,7 @@ static void __init setup_xstate_features(void) } } -static void print_xstate_feature(u64 xstate_mask) +static void __init print_xstate_feature(u64 xstate_mask) { const char *feature_name; @@ -202,7 +202,7 @@ static void print_xstate_feature(u64 xstate_mask) /* * Print out all the supported xstate features: */ -static void print_xstate_features(void) +static void __init print_xstate_features(void) { print_xstate_feature(XSTATE_FP); print_xstate_feature(XSTATE_SSE); @@ -219,7 +219,7 @@ static void print_xstate_features(void) * xsave area. This supports both standard format and compacted format * of the xsave aread. */ -static void setup_xstate_comp(void) +static void __init setup_xstate_comp(void) { unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; int i; @@ -260,7 +260,7 @@ static void setup_xstate_comp(void) /* * setup the xstate image representing the init state */ -static void setup_init_fpu_buf(void) +static void __init setup_init_fpu_buf(void) { if (!cpu_has_xsave) return; @@ -314,7 +314,7 @@ static void __init init_xstate_size(void) * * ( Not marked __init because of false positive section warnings. ) */ -void fpu__init_system_xstate(void) +void __init fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; -- cgit v1.2.1 From e1884d69f643c743806ebb9bc9292863ef39e894 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2015 11:49:58 +0200 Subject: x86/fpu: Pass 'struct fpu' to fpu__restore() This cleans up the call sites and the function a bit, and also makes it more symmetric with the other high level FPU state handling functions. It's still only valid for the current task, as we copy to the FPU registers of the current CPU. No change in functionality. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/core.c | 9 +++------ arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/traps.c | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7fdc90b9dd86..a4c1b7dbf70e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -50,7 +50,7 @@ extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu__activate_curr(struct fpu *fpu); extern void fpu__activate_stopped(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); -extern void fpu__restore(void); +extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d67558cdbddd..c0661604a258 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -343,11 +343,8 @@ void fpu__activate_stopped(struct fpu *child_fpu) * with local interrupts disabled, as it is in the case of * do_device_not_available()). */ -void fpu__restore(void) +void fpu__restore(struct fpu *fpu) { - struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; - fpu__activate_curr(fpu); /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ @@ -355,9 +352,9 @@ void fpu__restore(void) fpregs_activate(fpu); if (unlikely(copy_fpstate_to_fpregs(fpu))) { fpu__clear(fpu); - force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); + force_sig_info(SIGSEGV, SEND_SIG_PRIV, current); } else { - tsk->thread.fpu.counter++; + fpu->counter++; } kernel_fpu_enable(); } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 99f73093333d..50ec9af1bd51 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -319,7 +319,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpu->fpstate_active = 1; if (use_eager_fpu()) { preempt_disable(); - fpu__restore(); + fpu__restore(fpu); preempt_enable(); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6f581c65c648..a2510f230195 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -803,7 +803,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) return; } #endif - fpu__restore(); /* interrupts still off */ + fpu__restore(¤t->thread.fpu); /* interrupts still off */ #ifdef CONFIG_X86_32 conditional_sti(regs); #endif -- cgit v1.2.1 From d364a7656c1855c940dfa4baf4ebcc3c6a9e6fd2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 5 May 2015 11:43:02 +0200 Subject: x86/fpu: Fix the 'nofxsr' boot parameter to also clear X86_FEATURE_FXSR_OPT I tried to simulate an ancient CPU via this option, and found that it still has fxsr_opt enabled, confusing the FPU code. Make the 'nofxsr' option also clear FXSR_OPT flag. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d15610b0a4cf..d6fe512441e5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -171,6 +171,15 @@ static int __init x86_xsaves_setup(char *s) } __setup("noxsaves", x86_xsaves_setup); +static int __init x86_fxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT); + setup_clear_cpu_cap(X86_FEATURE_XMM); + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; @@ -182,14 +191,6 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -static int __init x86_fxsr_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_XMM); - return 1; -} -__setup("nofxsr", x86_fxsr_setup); - static int __init x86_sep_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_SEP); -- cgit v1.2.1 From e97131a8391e9fce5126ed54dc66c9d8965d3b4e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 5 May 2015 11:34:49 +0200 Subject: x86/fpu: Add CONFIG_X86_DEBUG_FPU=y FPU debugging code There are various internal FPU state debugging checks that never trigger in practice, but which are useful for FPU code development. Separate these out into CONFIG_X86_DEBUG_FPU=y, and also add a couple of new ones. The size difference is about 0.5K of code on defconfig: text data bss filename 15028906 2578816 1638400 vmlinux 15029430 2578816 1638400 vmlinux ( Keep this enabled by default until the new FPU code is debugged. ) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 12 ++++++++++++ arch/x86/include/asm/fpu/internal.h | 17 ++++++++++++++++- arch/x86/kernel/fpu/core.c | 18 +++++++++--------- arch/x86/kernel/fpu/init.c | 12 +++++++++++- arch/x86/kernel/fpu/xstate.c | 11 ++++++++++- 5 files changed, 58 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 72484a645f05..2fd3ebbb4e33 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -332,4 +332,16 @@ config X86_DEBUG_STATIC_CPU_HAS If unsure, say N. +config X86_DEBUG_FPU + bool "Debug the x86 FPU code" + depends on DEBUG_KERNEL + default y + ---help--- + If this option is enabled then there will be extra sanity + checks and (boot time) debug printouts added to the kernel. + This debugging adds some small amount of runtime overhead + to the kernel. + + If unsure, say N. + endmenu diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a4c1b7dbf70e..d2a281bd5f45 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -59,6 +59,15 @@ extern void fpu__clear(struct fpu *fpu); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); +/* + * Debugging facility: + */ +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ 0; }) +#endif + DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* @@ -296,6 +305,8 @@ static inline void __fpregs_deactivate_hw(void) /* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ static inline void __fpregs_deactivate(struct fpu *fpu) { + WARN_ON_FPU(!fpu->fpregs_active); + fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); } @@ -303,6 +314,8 @@ static inline void __fpregs_deactivate(struct fpu *fpu) /* Must be paired with a 'clts' (fpregs_activate_hw()) before! */ static inline void __fpregs_activate(struct fpu *fpu) { + WARN_ON_FPU(fpu->fpregs_active); + fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); } @@ -433,8 +446,10 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) { if (fpu_switch.preload) { - if (unlikely(copy_fpstate_to_fpregs(new_fpu))) + if (unlikely(copy_fpstate_to_fpregs(new_fpu))) { + WARN_ON_FPU(1); fpu__clear(new_fpu); + } } } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c0661604a258..01a15503c3be 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -38,13 +38,13 @@ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); static void kernel_fpu_disable(void) { - WARN_ON(this_cpu_read(in_kernel_fpu)); + WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, true); } static void kernel_fpu_enable(void) { - WARN_ON_ONCE(!this_cpu_read(in_kernel_fpu)); + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); } @@ -109,7 +109,7 @@ void __kernel_fpu_begin(void) { struct fpu *fpu = ¤t->thread.fpu; - WARN_ON_ONCE(!irq_fpu_usable()); + WARN_ON_FPU(!irq_fpu_usable()); kernel_fpu_disable(); @@ -127,7 +127,7 @@ void __kernel_fpu_end(void) struct fpu *fpu = ¤t->thread.fpu; if (fpu->fpregs_active) { - if (WARN_ON(copy_fpstate_to_fpregs(fpu))) + if (WARN_ON_FPU(copy_fpstate_to_fpregs(fpu))) fpu__clear(fpu); } else { __fpregs_deactivate_hw(); @@ -187,7 +187,7 @@ EXPORT_SYMBOL_GPL(irq_ts_restore); */ void fpu__save(struct fpu *fpu) { - WARN_ON(fpu != ¤t->thread.fpu); + WARN_ON_FPU(fpu != ¤t->thread.fpu); preempt_disable(); if (fpu->fpregs_active) { @@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(fpstate_init); */ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - WARN_ON(src_fpu != ¤t->thread.fpu); + WARN_ON_FPU(src_fpu != ¤t->thread.fpu); /* * Don't let 'init optimized' areas of the XSAVE area @@ -284,7 +284,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) */ void fpu__activate_curr(struct fpu *fpu) { - WARN_ON_ONCE(fpu != ¤t->thread.fpu); + WARN_ON_FPU(fpu != ¤t->thread.fpu); if (!fpu->fpstate_active) { fpstate_init(&fpu->state); @@ -321,7 +321,7 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr); */ void fpu__activate_stopped(struct fpu *child_fpu) { - WARN_ON_ONCE(child_fpu == ¤t->thread.fpu); + WARN_ON_FPU(child_fpu == ¤t->thread.fpu); if (child_fpu->fpstate_active) { child_fpu->last_cpu = -1; @@ -407,7 +407,7 @@ static inline void copy_init_fpstate_to_fpregs(void) */ void fpu__clear(struct fpu *fpu) { - WARN_ON_ONCE(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ + WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ if (!use_eager_fpu()) { /* FPU state will be reallocated lazily at the first use. */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index a9e506a99a83..e9f1d6e62146 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -143,6 +143,11 @@ EXPORT_SYMBOL_GPL(xstate_size); */ static void __init fpu__init_system_xstate_size_legacy(void) { + static int on_boot_cpu = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + /* * Note that xstate_size might be overwriten later during * fpu__init_system_xstate(). @@ -214,7 +219,12 @@ __setup("eagerfpu=", eager_fpu_setup); */ static void __init fpu__init_system_ctx_switch(void) { - WARN_ON(current->thread.fpu.fpstate_active); + static bool on_boot_cpu = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + WARN_ON_FPU(current->thread.fpu.fpstate_active); current_thread_info()->status = 0; /* Auto enable eagerfpu for xsaveopt */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 201f08feb259..5724098adf1b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -262,6 +262,11 @@ static void __init setup_xstate_comp(void) */ static void __init setup_init_fpu_buf(void) { + static int on_boot_cpu = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + if (!cpu_has_xsave) return; @@ -317,6 +322,10 @@ static void __init init_xstate_size(void) void __init fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; + static int on_boot_cpu = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; if (!cpu_has_xsave) { pr_info("x86/fpu: Legacy x87 FPU detected.\n"); @@ -324,7 +333,7 @@ void __init fpu__init_system_xstate(void) } if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { - WARN(1, "x86/fpu: XSTATE_CPUID missing!\n"); + WARN_ON_FPU(1); return; } -- cgit v1.2.1 From b1b64dc3558b7bde2917f9fc4f573f6a4787a95c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 5 May 2015 15:56:33 +0200 Subject: x86/fpu: Reorganize fpu/internal.h fpu/internal.h has grown organically, with not much high level structure, which hurts its readability. Organize the various definitions into 5 sections: - high level FPU state functions - FPU/CPU feature flag helpers - fpstate handling functions - FPU context switching helpers - misc helper functions Other related changes: - Move MXCSR_DEFAULT to fpu/types.h. - drop the unused X87_FSW_ES define No change in functionality. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 159 +++++++++++++++++++----------------- arch/x86/include/asm/fpu/types.h | 3 + 2 files changed, 87 insertions(+), 75 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d2a281bd5f45..a98a08d1efa9 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -18,32 +18,6 @@ #include #include -#define MXCSR_DEFAULT 0x1f80 - -extern unsigned int mxcsr_feature_mask; - -extern union fpregs_state init_fpstate; - -extern void fpu__init_cpu(void); -extern void fpu__init_system_xstate(void); -extern void fpu__init_cpu_xstate(void); -extern void fpu__init_system(struct cpuinfo_x86 *c); - -extern void fpstate_init(union fpregs_state *state); -#ifdef CONFIG_MATH_EMULATION -extern void fpstate_init_soft(struct swregs_state *soft); -#else -static inline void fpstate_init_soft(struct swregs_state *soft) {} -#endif -static inline void fpstate_init_fxstate(struct fxregs_state *fx) -{ - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; -} - -extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); -extern int fpu__exception_code(struct fpu *fpu, int trap_nr); - /* * High level FPU state handling functions: */ @@ -55,7 +29,16 @@ extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); extern void fpu__clear(struct fpu *fpu); +extern int fpu__exception_code(struct fpu *fpu, int trap_nr); +extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); +/* + * Boot time FPU initialization functions: + */ +extern void fpu__init_cpu(void); +extern void fpu__init_system_xstate(void); +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); @@ -68,27 +51,9 @@ extern void fpu__resume_cpu(void); # define WARN_ON_FPU(x) ({ 0; }) #endif -DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); - /* - * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, - * on this CPU. - * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. + * FPU related CPU feature flag helper routines: */ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) -{ - per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; -} - -static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) -{ - return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; -} - -#define X87_FSW_ES (1 << 7) /* Exception Summary */ - static __always_inline __pure bool use_eager_fpu(void) { return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); @@ -109,6 +74,23 @@ static __always_inline __pure bool use_fxsr(void) return static_cpu_has_safe(X86_FEATURE_FXSR); } +/* + * fpstate handling functions: + */ + +extern union fpregs_state init_fpstate; + +extern void fpstate_init(union fpregs_state *state); +#ifdef CONFIG_MATH_EMULATION +extern void fpstate_init_soft(struct swregs_state *soft); +#else +static inline void fpstate_init_soft(struct swregs_state *soft) {} +#endif +static inline void fpstate_init_fxstate(struct fxregs_state *fx) +{ + fx->cwd = 0x37f; + fx->mxcsr = MXCSR_DEFAULT; +} extern void fpstate_sanitize_xstate(struct fpu *fpu); #define user_insn(insn, output, input...) \ @@ -285,6 +267,32 @@ static inline int copy_fpstate_to_fpregs(struct fpu *fpu) return __copy_fpstate_to_fpregs(fpu); } +extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fx, int size); + +/* + * FPU context switch related helper methods: + */ + +DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +/* + * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, + * on this CPU. + * + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. + */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; +} + +static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + + /* * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' * idiom, which is then paired with the sw-flag (fpregs_active) later on: @@ -354,31 +362,6 @@ static inline void fpregs_deactivate(struct fpu *fpu) __fpregs_deactivate_hw(); } -/* - * Definitions for the eXtended Control Register instructions - */ - -#define XCR_XFEATURE_ENABLED_MASK 0x00000000 - -static inline u64 xgetbv(u32 index) -{ - u32 eax, edx; - - asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ - : "=a" (eax), "=d" (edx) - : "c" (index)); - return eax + ((u64)edx << 32); -} - -static inline void xsetbv(u32 index, u64 value) -{ - u32 eax = value; - u32 edx = value >> 32; - - asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ - : : "a" (eax), "d" (edx), "c" (index)); -} - /* * FPU state switching for scheduling. * @@ -437,6 +420,10 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) return fpu; } +/* + * Misc helper functions: + */ + /* * By the time this gets called, we've already cleared CR0.TS and * given the process the FPU if we are going to preload the FPU @@ -453,11 +440,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switc } } -/* - * Signal frame handlers... - */ -extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fx, int size); - /* * Needs to be preemption-safe. * @@ -476,4 +458,31 @@ static inline void user_fpu_begin(void) preempt_enable(); } +/* + * MXCSR and XCR definitions: + */ + +extern unsigned int mxcsr_feature_mask; + +#define XCR_XFEATURE_ENABLED_MASK 0x00000000 + +static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ + : : "a" (eax), "d" (edx), "c" (index)); +} + #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 02241c2a10e9..0637826292de 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -65,6 +65,9 @@ struct fxregs_state { } __attribute__((aligned(16))); +/* Default value for fxregs_state.mxcsr: */ +#define MXCSR_DEFAULT 0x1f80 + /* * Software based FPU emulation state. This is arbitrary really, * it matches the x87 format to make it easier to understand: -- cgit v1.2.1 From a2f1c8bdc02bfcaa5a658283b883fdb54e328b36 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 19 May 2015 17:07:15 +0800 Subject: x86/irq/msi: Implement irq_set_vcpu_affinity for remapped MSI irqs Implement irq_set_vcpu_affinity for pci_msi_ir_controller. Signed-off-by: Feng Wu Reviewed-by: Jiang Liu Link: http://lkml.kernel.org/r/1432026437-16560-3-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ef516afa20bb..1a9d735e09c6 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -152,6 +152,7 @@ static struct irq_chip pci_msi_ir_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, .flags = IRQCHIP_SKIP_SET_WAKE, }; -- cgit v1.2.1 From f6b3c72c23661e5534cd2eede16e9bac7ebb761c Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 19 May 2015 17:07:16 +0800 Subject: x86/irq: Define a global vector for VT-d Posted-Interrupts Currently, we use a global vector as the Posted-Interrupts Notification Event for all the vCPUs in the system. We need to introduce another global vector for VT-d Posted-Interrtups, which will be used to wakeup the sleep vCPU when an external interrupt from a direct-assigned device happens for that vCPU. [ tglx: Removed a gazillion of extra newlines ] Signed-off-by: Feng Wu Cc: jiang.liu@linux.intel.com Link: http://lkml.kernel.org/r/1432026437-16560-4-git-send-email-feng.wu@intel.com Suggested-by: Yang Zhang Acked-by: H. Peter Anvin Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/entry_arch.h | 2 ++ arch/x86/include/asm/hardirq.h | 1 + arch/x86/include/asm/hw_irq.h | 2 ++ arch/x86/include/asm/irq.h | 4 ++++ arch/x86/include/asm/irq_vectors.h | 1 + arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq.c | 26 ++++++++++++++++++++++++++ arch/x86/kernel/irqinit.c | 2 ++ 8 files changed, 40 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index dc5fa661465f..27ca0afcccd7 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_HAVE_KVM BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, smp_kvm_posted_intr_ipi) +BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, + smp_kvm_posted_intr_wakeup_ipi) #endif /* diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 0f5fb6b6567e..986606539395 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -14,6 +14,7 @@ typedef struct { #endif #ifdef CONFIG_HAVE_KVM unsigned int kvm_posted_intr_ipis; + unsigned int kvm_posted_intr_wakeup_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9ec5d37d8da3..10c80d4f8386 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,6 +29,7 @@ extern asmlinkage void apic_timer_interrupt(void); extern asmlinkage void x86_platform_ipi(void); extern asmlinkage void kvm_posted_intr_ipi(void); +extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); @@ -58,6 +59,7 @@ extern void trace_call_function_single_interrupt(void); #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt #define trace_reboot_interrupt reboot_interrupt #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi +#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi #endif /* CONFIG_TRACING */ #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index a80cbb88ea91..8008d06581c7 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -30,6 +30,10 @@ extern void fixup_irqs(void); extern void irq_force_complete_move(int); #endif +#ifdef CONFIG_HAVE_KVM +extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); +#endif + extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index bf55235d7772..0ed29ac13a9d 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -86,6 +86,7 @@ /* Vector for KVM to deliver posted interrupt IPI */ #ifdef CONFIG_HAVE_KVM #define POSTED_INTR_VECTOR 0xf2 +#define POSTED_INTR_WAKEUP_VECTOR 0xf1 #endif /* diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 47b95813dc37..22aadc917868 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -916,6 +916,8 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ #ifdef CONFIG_HAVE_KVM apicinterrupt3 POSTED_INTR_VECTOR \ kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \ + kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index be3894512820..90b2f7052f5b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -242,6 +242,18 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs) } #ifdef CONFIG_HAVE_KVM +static void dummy_handler(void) {} +static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; + +void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) +{ + if (handler) + kvm_posted_intr_wakeup_handler = handler; + else + kvm_posted_intr_wakeup_handler = dummy_handler; +} +EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); + /* * Handler for POSTED_INTERRUPT_VECTOR. */ @@ -254,6 +266,20 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) exiting_irq(); set_irq_regs(old_regs); } + +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + entering_ack_irq(); + inc_irq_stat(kvm_posted_intr_wakeup_ipis); + kvm_posted_intr_wakeup_handler(); + exiting_irq(); + set_irq_regs(old_regs); +} #endif __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index dc1e08d23552..680723a8e4b6 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -144,6 +144,8 @@ static void __init apic_intr_init(void) #ifdef CONFIG_HAVE_KVM /* IPI for KVM to deliver posted interrupt */ alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); + /* IPI for KVM to deliver interrupt to wake up tasks */ + alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); #endif /* IPI vectors for APIC spurious and error interrupts */ -- cgit v1.2.1 From 501b32653ebf49114cccb9afbf9150cf18fd8700 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 19 May 2015 17:07:17 +0800 Subject: x86/irq: Show statistics information for posted-interrupts Show the statistics information for notification event and wakeup event for posted-interrupt in /proc/interrupts. [ tglx: Named the short identifiers PIN and PIW to match the long identifiers ] Signed-off-by: Feng Wu Cc: jiang.liu@linux.intel.com Link: http://lkml.kernel.org/r/1432026437-16560-5-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 90b2f7052f5b..7e10c8b4b318 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -141,6 +141,18 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); +#endif +#ifdef CONFIG_HAVE_KVM + seq_printf(p, "%*s: ", prec, "PIN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); + seq_puts(p, " Posted-interrupt notification event\n"); + + seq_printf(p, "%*s: ", prec, "PIW"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_wakeup_ipis); + seq_puts(p, " Posted-interrupt wakeup event\n"); #endif return 0; } -- cgit v1.2.1 From c546d5db75b452737e554bb9c33dedb46847c4fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2015 08:36:29 +0200 Subject: remove scatterlist.h generation from arch Kbuild files Signed-off-by: Christoph Hellwig Reported-by: Geert Uytterhoeven Signed-off-by: Jens Axboe --- arch/x86/include/asm/Kbuild | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index d55a210a49bf..4dd1f2d770af 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -9,4 +9,3 @@ generic-y += cputime.h generic-y += dma-contiguous.h generic-y += early_ioremap.h generic-y += mcs_spinlock.h -generic-y += scatterlist.h -- cgit v1.2.1 From 3db176d5b4170284d9ce1e1e9c441ebfa9a37417 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 19 Apr 2015 21:12:59 +0300 Subject: KVM: x86: Fix DR7 mask on task-switch while debugging If the host sets hardware breakpoints to debug the guest, and a task-switch occurs in the guest, the architectural DR7 will not be updated. The effective DR7 would be updated instead. This fix puts the DR7 update during task-switch emulation, so it now uses the standard DR setting mechanism instead of the one that was previously used. As a bonus, the update of DR7 will now be effective for AMD as well. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 6 +++++- arch/x86/kvm/vmx.c | 3 --- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5839fc56cb3e..b32a38e6e287 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "x86.h" #include "tss.h" @@ -2849,7 +2850,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ulong old_tss_base = ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; - ulong desc_addr; + ulong desc_addr, dr7; /* FIXME: old_tss_base == ~0 ? */ @@ -2934,6 +2935,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ret = em_push(ctxt); } + ops->get_dr(ctxt, 7, &dr7); + ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN)); + return ret; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5e8d41bccc26..bcb61b024386 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5708,9 +5708,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) return 0; } - /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155); - /* * TODO: What about debug traps on tss switch? * Are we supposed to inject them and update dr6? -- cgit v1.2.1 From ee122a7109e42313caadf6038ab773d1f68fcce1 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 28 Apr 2015 13:06:00 +0300 Subject: KVM: x86: Fix update RCX/RDI/RSI on REP-string When REP-string instruction is preceded with an address-size prefix, ECX/EDI/ESI are used as the operation counter and pointers. When they are updated, the high 32-bits of RCX/RDI/RSI are cleared, similarly to the way they are updated on every 32-bit register operation. Fix it. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b32a38e6e287..e8c03be83e48 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -524,13 +524,9 @@ static void masked_increment(ulong *reg, ulong mask, int inc) static inline void register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc) { - ulong mask; + ulong *preg = reg_rmw(ctxt, reg); - if (ctxt->ad_bytes == sizeof(unsigned long)) - mask = ~0UL; - else - mask = ad_mask(ctxt); - masked_increment(reg_rmw(ctxt, reg), mask, inc); + assign_register(preg, *preg + inc, ctxt->ad_bytes); } static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) -- cgit v1.2.1 From 428e3d08574b77876ea5e71f294f91bd8afa51b5 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 28 Apr 2015 13:06:01 +0300 Subject: KVM: x86: Fix zero iterations REP-string When a REP-string is executed in 64-bit mode with an address-size prefix, ECX/EDI/ESI are used as counter and pointers. When ECX is initially zero, Intel CPUs clear the high 32-bits of RCX, and recent Intel CPUs update the high bits of the pointers in MOVS/STOS. This behavior is specific to Intel according to few experiments. As one may guess, this is an undocumented behavior. Yet, it is observable in the guest, since at least VMX traps REP-INS/OUTS even when ECX=0. Note that VMware appears to get it right. The behavior can be observed using the following code: #include #define LOW_MASK (0xffffffff00000000ull) #define ALL_MASK (0xffffffffffffffffull) #define TEST(opcode) \ do { \ asm volatile(".byte 0xf2 \n\t .byte 0x67 \n\t .byte " opcode "\n\t" \ : "=S"(s), "=c"(c), "=D"(d) \ : "S"(ALL_MASK), "c"(LOW_MASK), "D"(ALL_MASK)); \ printf("opcode %s rcx=%llx rsi=%llx rdi=%llx\n", \ opcode, c, s, d); \ } while(0) void main() { unsigned long long s, d, c; iopl(3); TEST("0x6c"); TEST("0x6d"); TEST("0x6e"); TEST("0x6f"); TEST("0xa4"); TEST("0xa5"); TEST("0xa6"); TEST("0xa7"); TEST("0xaa"); TEST("0xab"); TEST("0xae"); TEST("0xaf"); } Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e8c03be83e48..9b655d113fc6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2570,6 +2570,30 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, return true; } +static void string_registers_quirk(struct x86_emulate_ctxt *ctxt) +{ + /* + * Intel CPUs mask the counter and pointers in quite strange + * manner when ECX is zero due to REP-string optimizations. + */ +#ifdef CONFIG_X86_64 + if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt)) + return; + + *reg_write(ctxt, VCPU_REGS_RCX) = 0; + + switch (ctxt->b) { + case 0xa4: /* movsb */ + case 0xa5: /* movsd/w */ + *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1; + /* fall through */ + case 0xaa: /* stosb */ + case 0xab: /* stosd/w */ + *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1; + } +#endif +} + static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { @@ -4910,6 +4934,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if (ctxt->rep_prefix && (ctxt->d & String)) { /* All REP prefixes have the same first termination condition */ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { + string_registers_quirk(ctxt); ctxt->eip = ctxt->_eip; ctxt->eflags &= ~X86_EFLAGS_RF; goto done; -- cgit v1.2.1 From edc90b7dc4ceef62ef0ad9cc6c3f5dc770e83ad2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 11 May 2015 22:55:21 +0800 Subject: KVM: MMU: fix SMAP virtualization KVM may turn a user page to a kernel page when kernel writes a readonly user page if CR0.WP = 1. This shadow page entry will be reused after SMAP is enabled so that kernel is allowed to access this user page Fix it by setting SMAP && !CR0.WP into shadow page's role and reset mmu once CR4.SMAP is updated Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 16 ++++++++++++---- arch/x86/kvm/mmu.h | 2 -- arch/x86/kvm/x86.c | 8 +++----- 4 files changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8b661d1946b5..bbb8f4e7738a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -207,6 +207,7 @@ union kvm_mmu_page_role { unsigned nxe:1; unsigned cr0_wp:1; unsigned smep_andnot_wp:1; + unsigned smap_andnot_wp:1; }; }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 209fe1477465..44a7d2515497 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3736,8 +3736,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, } } -void update_permission_bitmask(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu, bool ept) +static void update_permission_bitmask(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; @@ -3918,6 +3918,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); struct kvm_mmu *context = &vcpu->arch.mmu; MMU_WARN_ON(VALID_PAGE(context->root_hpa)); @@ -3936,6 +3937,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) context->base_role.cr0_wp = is_write_protection(vcpu); context->base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); + context->base_role.smap_andnot_wp + = smap && !is_write_protection(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -4207,12 +4210,18 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; - union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush, zap_page; + union kvm_mmu_page_role mask = (union kvm_mmu_page_role) { + .cr0_wp = 1, + .cr4_pae = 1, + .nxe = 1, + .smep_andnot_wp = 1, + .smap_andnot_wp = 1, + }; /* * If we don't have indirect shadow pages, it means no page is @@ -4238,7 +4247,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); - mask.cr0_wp = mask.cr4_pae = mask.nxe = mask.smep_andnot_wp = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 06eb2fc1bab8..0ada65ecddcf 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -71,8 +71,6 @@ enum { int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); -void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - bool ept); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cdccbe1749a5..cde5d614ff0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -702,8 +702,9 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | - X86_CR4_PAE | X86_CR4_SMEP; + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP; + if (cr4 & CR4_RESERVED_BITS) return 1; @@ -744,9 +745,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); - if ((cr4 ^ old_cr4) & X86_CR4_SMAP) - update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); -- cgit v1.2.1 From c35ebbeade127e7bca1f21ef5bf1a39deffba9de Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 May 2015 15:59:26 +0200 Subject: Revert "kvmclock: set scheduler clock stable" This reverts commit ff7bbb9c6ab6e6620429daeff39424bbde1a94b4. Sasha Levin is seeing odd jump in time values during boot of a KVM guest: [...] [ 0.000000] tsc: Detected 2260.998 MHz processor [3376355.247558] Calibrating delay loop (skipped) preset value.. [...] and bisected them to this commit. Reported-by: Sasha Levin Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 4e03921761c4..42caaef897c8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -266,8 +265,6 @@ void __init kvmclock_init(void) if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); - - set_sched_clock_stable(); } int __init kvm_setup_vsyscall_timeinfo(void) -- cgit v1.2.1 From 0d5367900a319ab8971817b0ca15a8b9f7c47e6f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 May 2015 14:42:20 +0800 Subject: KVM: MMU: introduce for_each_rmap_spte() It's used to walk all the sptes on the rmap to clean up the code Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 53 ++++++++++++++++-------------------------------- arch/x86/kvm/mmu_audit.c | 4 +--- 2 files changed, 19 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 44a7d2515497..7a1158533f89 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1142,6 +1142,11 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) return NULL; } +#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \ + for (_spte_ = rmap_get_first(*_rmap_, _iter_); \ + _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \ + _spte_ = rmap_get_next(_iter_)) + static void drop_spte(struct kvm *kvm, u64 *sptep) { if (mmu_spte_clear_track_bits(sptep)) @@ -1205,12 +1210,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_write_protect(kvm, sptep, pt_protect); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1232,12 +1233,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_clear_dirty(kvm, sptep); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1259,12 +1256,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_set_dirty(kvm, sptep); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1394,8 +1387,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!is_shadow_present_pte(*sptep)); +restart: + for_each_rmap_spte(rmapp, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); @@ -1403,7 +1396,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, if (pte_write(*ptep)) { drop_spte(kvm, sptep); - sptep = rmap_get_first(*rmapp, &iter); + goto restart; } else { new_spte = *sptep & ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; @@ -1414,7 +1407,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); - sptep = rmap_get_next(&iter); } } @@ -1518,16 +1510,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, BUG_ON(!shadow_accessed_mask); - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { - BUG_ON(!is_shadow_present_pte(*sptep)); - + for_each_rmap_spte(rmapp, &iter, sptep) if (*sptep & shadow_accessed_mask) { young = 1; clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } - } + trace_kvm_age_page(gfn, level, slot, young); return young; } @@ -1548,15 +1537,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { - BUG_ON(!is_shadow_present_pte(*sptep)); - + for_each_rmap_spte(rmapp, &iter, sptep) if (*sptep & shadow_accessed_mask) { young = 1; break; } - } out: return young; } @@ -4482,9 +4467,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, pfn_t pfn; struct kvm_mmu_page *sp; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - +restart: + for_each_rmap_spte(rmapp, &iter, sptep) { sp = page_header(__pa(sptep)); pfn = spte_to_pfn(*sptep); @@ -4499,10 +4483,9 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, !kvm_is_reserved_pfn(pfn) && PageTransCompound(pfn_to_page(pfn))) { drop_spte(kvm, sptep); - sptep = rmap_get_first(*rmapp, &iter); need_tlb_flush = 1; - } else - sptep = rmap_get_next(&iter); + goto restart; + } } return need_tlb_flush; diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 9ade5cfb5a4c..368d53497314 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -197,13 +197,11 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL); - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { + for_each_rmap_spte(rmapp, &iter, sptep) if (is_writable_pte(*sptep)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); - } } static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -- cgit v1.2.1 From 8a3d08f16fc63400f637dfa69aa5c7ea016ee18a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 May 2015 14:42:21 +0800 Subject: KVM: MMU: introduce PT_MAX_HUGEPAGE_LEVEL Suggested-by: Paolo Bonzini Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 24 +++++++++--------------- arch/x86/kvm/mmu.h | 1 + 2 files changed, 10 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7a1158533f89..b1f5f09e0c29 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -811,8 +811,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) int i; slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count += 1; } @@ -826,8 +825,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) int i; slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count -= 1; WARN_ON(linfo->write_count < 0); @@ -858,8 +856,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) page_size = kvm_host_page_size(kvm, gfn); - for (i = PT_PAGE_TABLE_LEVEL; - i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { if (page_size >= KVM_HPAGE_SIZE(i)) ret = i; else @@ -1344,8 +1341,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) slot = gfn_to_memslot(kvm, gfn); - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmapp, true); } @@ -1451,7 +1447,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); for (j = PT_PAGE_TABLE_LEVEL; - j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { + j <= PT_MAX_HUGEPAGE_LEVEL; ++j) { unsigned long idx, idx_end; unsigned long *rmapp; gfn_t gfn = gfn_start; @@ -4416,8 +4412,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { unsigned long *rmapp; unsigned long last_index, index; @@ -4573,8 +4568,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); - for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + /* skip rmap for 4K page */ + for (i = PT_PAGE_TABLE_LEVEL + 1; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { unsigned long *rmapp; unsigned long last_index, index; @@ -4611,8 +4606,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, spin_lock(&kvm->mmu_lock); - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { unsigned long *rmapp; unsigned long last_index, index; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 0ada65ecddcf..72bb33f70c16 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -43,6 +43,7 @@ #define PT_PDPE_LEVEL 3 #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 +#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1) static inline u64 rsvd_bits(int s, int e) { -- cgit v1.2.1 From 6ce1f4e295dd06b19443341b1684d361739ec117 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 May 2015 14:42:22 +0800 Subject: KVM: MMU: introduce for_each_slot_rmap_range It's used to abstract the code from kvm_handle_hva_range and it will be used by later patch Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 97 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b1f5f09e0c29..316c43243995 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1412,6 +1412,74 @@ restart: return 0; } +struct slot_rmap_walk_iterator { + /* input fields. */ + struct kvm_memory_slot *slot; + gfn_t start_gfn; + gfn_t end_gfn; + int start_level; + int end_level; + + /* output fields. */ + gfn_t gfn; + unsigned long *rmap; + int level; + + /* private field. */ + unsigned long *end_rmap; +}; + +static void +rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) +{ + iterator->level = level; + iterator->gfn = iterator->start_gfn; + iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); + iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, + iterator->slot); +} + +static void +slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, + struct kvm_memory_slot *slot, int start_level, + int end_level, gfn_t start_gfn, gfn_t end_gfn) +{ + iterator->slot = slot; + iterator->start_level = start_level; + iterator->end_level = end_level; + iterator->start_gfn = start_gfn; + iterator->end_gfn = end_gfn; + + rmap_walk_init_level(iterator, iterator->start_level); +} + +static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) +{ + return !!iterator->rmap; +} + +static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) +{ + if (++iterator->rmap <= iterator->end_rmap) { + iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + return; + } + + if (++iterator->level > iterator->end_level) { + iterator->rmap = NULL; + return; + } + + rmap_walk_init_level(iterator, iterator->level); +} + +#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \ + _start_gfn, _end_gfn, _iter_) \ + for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \ + _end_level_, _start_gfn, _end_gfn); \ + slot_rmap_walk_okay(_iter_); \ + slot_rmap_walk_next(_iter_)) + static int kvm_handle_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, @@ -1423,10 +1491,10 @@ static int kvm_handle_hva_range(struct kvm *kvm, int level, unsigned long data)) { - int j; - int ret = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + struct slot_rmap_walk_iterator iterator; + int ret = 0; slots = kvm_memslots(kvm); @@ -1446,26 +1514,11 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_start = hva_to_gfn_memslot(hva_start, memslot); gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - for (j = PT_PAGE_TABLE_LEVEL; - j <= PT_MAX_HUGEPAGE_LEVEL; ++j) { - unsigned long idx, idx_end; - unsigned long *rmapp; - gfn_t gfn = gfn_start; - - /* - * {idx(page_j) | page_j intersects with - * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}. - */ - idx = gfn_to_index(gfn_start, memslot->base_gfn, j); - idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j); - - rmapp = __gfn_to_rmap(gfn_start, j, memslot); - - for (; idx <= idx_end; - ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j))) - ret |= handler(kvm, rmapp++, memslot, - gfn, j, data); - } + for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1, + &iterator) + ret |= handler(kvm, iterator.rmap, memslot, + iterator.gfn, iterator.level, data); } return ret; -- cgit v1.2.1 From 1bad2b2a3b158fbb19fef6cd563301b94b5c28b2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 May 2015 14:42:23 +0800 Subject: KVM: MMU: introduce slot_handle_level_range() and its helpers There are several places walking all rmaps for the memslot so that introduce common functions to cleanup the code Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 316c43243995..d1d072d70b7b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4454,6 +4454,75 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } +/* The return value indicates if tlb flush on all vcpus is needed. */ +typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap); + +/* The caller should hold mmu-lock before calling this function. */ +static bool +slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) +{ + struct slot_rmap_walk_iterator iterator; + bool flush = false; + + for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + cond_resched_lock(&kvm->mmu_lock); + } + } + + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + + return flush; +} + +static bool +slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + bool lock_flush_tlb) +{ + return slot_handle_level_range(kvm, memslot, fn, start_level, + end_level, memslot->base_gfn, + memslot->base_gfn + memslot->npages - 1, + lock_flush_tlb); +} + +static bool +slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static bool +slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static bool +slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_PAGE_TABLE_LEVEL, lock_flush_tlb); +} + void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot) { -- cgit v1.2.1 From d77aa73c7072c598a6d1f3a781c0e4fae067df76 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 May 2015 14:42:24 +0800 Subject: KVM: MMU: use slot_handle_level and its helper to clean up the code slot_handle_level and its helper functions are ready now, use them to clean up the code Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 128 +++++++---------------------------------------------- 1 file changed, 16 insertions(+), 112 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d1d072d70b7b..ed239c696056 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4523,34 +4523,19 @@ slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, PT_PAGE_TABLE_LEVEL, lock_flush_tlb); } +static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp) +{ + return __rmap_write_protect(kvm, rmapp, false); +} + void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - int i; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; - - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_write_protect(kvm, rmapp, - false); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - } - + flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect, + false); spin_unlock(&kvm->mmu_lock); /* @@ -4611,59 +4596,18 @@ restart: void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush = false; - unsigned long *rmapp; - unsigned long last_index, index; - spin_lock(&kvm->mmu_lock); - - rmapp = memslot->arch.rmap[0]; - last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1, - memslot->base_gfn, PT_PAGE_TABLE_LEVEL); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - if (flush) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - cond_resched_lock(&kvm->mmu_lock); - } - } - - if (flush) - kvm_flush_remote_tlbs(kvm); - + slot_handle_leaf(kvm, memslot, kvm_mmu_zap_collapsible_spte, true); spin_unlock(&kvm->mmu_lock); } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - unsigned long *rmapp; - unsigned long last_index, index; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, - PT_PAGE_TABLE_LEVEL); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_clear_dirty(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - + flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); spin_unlock(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); @@ -4682,31 +4626,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - int i; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - /* skip rmap for 4K page */ - for (i = PT_PAGE_TABLE_LEVEL + 1; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; - - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_write_protect(kvm, rmapp, - false); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - } + flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, + false); spin_unlock(&kvm->mmu_lock); /* see kvm_mmu_slot_remove_write_access */ @@ -4720,30 +4644,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); void kvm_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - int i; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; - - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_set_dirty(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - } - + flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); spin_unlock(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); -- cgit v1.2.1 From 6a49f85c7ac83c1918d138d40492a5cef40b5ff8 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 May 2015 14:42:25 +0800 Subject: KVM: MMU: introduce kvm_zap_rmapp Split kvm_unmap_rmapp and introduce kvm_zap_rmapp which will be used in the later patch Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ed239c696056..ddf25f39735a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1349,24 +1349,28 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) return write_protected; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp) { u64 *sptep; struct rmap_iterator iter; - int need_tlb_flush = 0; + bool flush = false; while ((sptep = rmap_get_first(*rmapp, &iter))) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); drop_spte(kvm, sptep); - need_tlb_flush = 1; + flush = true; } - return need_tlb_flush; + return flush; +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) +{ + return kvm_zap_rmapp(kvm, rmapp); } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, -- cgit v1.2.1 From d69afbc6b1b5d0579f13d1a6339d952c4f60a9f4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 May 2015 14:42:19 +0800 Subject: KVM: MMU: fix decoding cache type from MTRR There are some bugs in current get_mtrr_type(); 1: bit 1 of mtrr_state->enabled is corresponding bit 11 of IA32_MTRR_DEF_TYPE MSR which completely control MTRR's enablement that means other bits are ignored if it is cleared 2: the fixed MTRR ranges are controlled by bit 0 of mtrr_state->enabled (bit 10 of IA32_MTRR_DEF_TYPE) 3: if MTRR is disabled, UC is applied to all of physical memory rather than mtrr_state->def_type Signed-off-by: Xiao Guangrong Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ddf25f39735a..e718c76609f9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2431,19 +2431,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); static int get_mtrr_type(struct mtrr_state_type *mtrr_state, u64 start, u64 end) { - int i; u64 base, mask; u8 prev_match, curr_match; - int num_var_ranges = KVM_NR_VAR_MTRR; + int i, num_var_ranges = KVM_NR_VAR_MTRR; - if (!mtrr_state->enabled) - return 0xFF; + /* MTRR is completely disabled, use UC for all of physical memory. */ + if (!(mtrr_state->enabled & 0x2)) + return MTRR_TYPE_UNCACHABLE; /* Make end inclusive end, instead of exclusive */ end--; /* Look in fixed ranges. Just return the type as per start */ - if (mtrr_state->have_fixed && (start < 0x100000)) { + if (mtrr_state->have_fixed && (mtrr_state->enabled & 0x1) && + (start < 0x100000)) { int idx; if (start < 0x80000) { @@ -2466,9 +2467,6 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state, * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - if (!(mtrr_state->enabled & 2)) - return mtrr_state->def_type; - prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state; -- cgit v1.2.1 From efdfe536d8c643391e19d5726b072f82964bfbdb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 May 2015 14:42:27 +0800 Subject: KVM: MMU: fix MTRR update Currently, whenever guest MTRR registers are changed kvm_mmu_reset_context is called to switch to the new root shadow page table, however, it's useless since: 1) the cache type is not cached into shadow page's attribute so that the original root shadow page will be reused 2) the cache type is set on the last spte, that means we should sync the last sptes when MTRR is changed This patch fixs this issue by drop all the spte in the gfn range which is being updated by MTRR Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 24 ++++++++++++++++++++++ arch/x86/kvm/mmu.h | 1 + arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 83 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e718c76609f9..5bebec1191f1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4525,6 +4525,30 @@ slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, PT_PAGE_TABLE_LEVEL, lock_flush_tlb); } +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + + spin_lock(&kvm->mmu_lock); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; + + slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); + } + + spin_unlock(&kvm->mmu_lock); +} + static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp) { return __rmap_write_protect(kvm, rmapp, false); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 72bb33f70c16..398d21c0f6dd 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -171,4 +171,5 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cde5d614ff0a..bbe184f07bf9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1852,6 +1852,63 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } EXPORT_SYMBOL_GPL(kvm_mtrr_valid); +static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct mtrr_state_type *mtrr_state = &vcpu->arch.mtrr_state; + unsigned char mtrr_enabled = mtrr_state->enabled; + gfn_t start, end, mask; + int index; + bool is_fixed = true; + + if (msr == MSR_IA32_CR_PAT || !tdp_enabled || + !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + return; + + if (!(mtrr_enabled & 0x2) && msr != MSR_MTRRdefType) + return; + + switch (msr) { + case MSR_MTRRfix64K_00000: + start = 0x0; + end = 0x80000; + break; + case MSR_MTRRfix16K_80000: + start = 0x80000; + end = 0xa0000; + break; + case MSR_MTRRfix16K_A0000: + start = 0xa0000; + end = 0xc0000; + break; + case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: + index = msr - MSR_MTRRfix4K_C0000; + start = 0xc0000 + index * (32 << 10); + end = start + (32 << 10); + break; + case MSR_MTRRdefType: + is_fixed = false; + start = 0x0; + end = ~0ULL; + break; + default: + /* variable range MTRRs. */ + is_fixed = false; + index = (msr - 0x200) / 2; + start = (((u64)mtrr_state->var_ranges[index].base_hi) << 32) + + (mtrr_state->var_ranges[index].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state->var_ranges[index].mask_hi) << 32) + + (mtrr_state->var_ranges[index].mask_lo & PAGE_MASK); + mask |= ~0ULL << cpuid_maxphyaddr(vcpu); + + end = ((start & mask) | ~mask) + 1; + } + + if (is_fixed && !(mtrr_enabled & 0x1)) + return; + + kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); +} + static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; @@ -1885,7 +1942,7 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) *pt = data; } - kvm_mmu_reset_context(vcpu); + update_mtrr(vcpu, msr); return 0; } -- cgit v1.2.1 From d81135a57aa66e5bec55cdfe4dda79633f9082dd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 May 2015 14:42:28 +0800 Subject: KVM: x86: do not reset mmu if CR0.CD and CR0.NW are changed CR0.CD and CR0.NW are not used by shadow page table so that need not adjust mmu if these two bit are changed Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bbe184f07bf9..457b908244f2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -572,8 +572,7 @@ out: int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | - X86_CR0_CD | X86_CR0_NW; + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; -- cgit v1.2.1 From 3520469d65f26a1cd2f610f5d5de976f78db74fe Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 2 Apr 2015 11:20:48 +0200 Subject: KVM: export __gfn_to_pfn_memslot, drop gfn_to_pfn_async gfn_to_pfn_async is used in just one place, and because of x86-specific treatment that place will need to look at the memory slot. Hence inline it into try_async_pf and export __gfn_to_pfn_memslot. The patch also switches the subsequent call to gfn_to_pfn_prot to use __gfn_to_pfn_memslot. This is a small optimization. Finally, remove the now-unused async argument of __gfn_to_pfn. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5bebec1191f1..49c34e632b91 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3511,10 +3511,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu) static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable) { + struct kvm_memory_slot *slot; bool async; - *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); - + slot = gfn_to_memslot(vcpu->kvm, gfn); + async = false; + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); if (!async) return false; /* *pfn has correct page already */ @@ -3528,8 +3530,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return true; } - *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); - + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); return false; } -- cgit v1.2.1 From b11ca7fbc735aca5ab78ce8130603bbf4fefd3f7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 May 2015 09:59:30 +0200 Subject: x86/fpu/xstate: Use explicit parameter in xstate_fault() While looking at xstate.h it took me some time to realize that 'xstate_fault' uses 'err' as a silent parameter. This is not obvious at the call site, at all. Make it an explicit macro argument, so that the syntactic connection is easier to see. Also explain xstate_fault() a bit. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 8f336d2ae126..a6e07bb31526 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -47,12 +47,18 @@ extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" -#define xstate_fault ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err) +/* xstate instruction fault handler: */ +#define xstate_fault(__err) \ + \ + ".section .fixup,\"ax\"\n" \ + \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + \ + ".previous\n" \ + \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (__err) /* * This function is called only during boot time when x86 caps are not set @@ -70,13 +76,13 @@ static inline int copy_xregs_to_kernel_booting(struct xregs_state *fx) if (boot_cpu_has(X86_FEATURE_XSAVES)) asm volatile("1:"XSAVES"\n\t" "2:\n\t" - xstate_fault + xstate_fault(err) : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); else asm volatile("1:"XSAVE"\n\t" "2:\n\t" - xstate_fault + xstate_fault(err) : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); return err; @@ -97,13 +103,13 @@ static inline int copy_kernel_to_xregs_booting(struct xregs_state *fx, u64 mask) if (boot_cpu_has(X86_FEATURE_XSAVES)) asm volatile("1:"XRSTORS"\n\t" "2:\n\t" - xstate_fault + xstate_fault(err) : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); else asm volatile("1:"XRSTOR"\n\t" "2:\n\t" - xstate_fault + xstate_fault(err) : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); return err; @@ -141,7 +147,7 @@ static inline int copy_xregs_to_kernel(struct xregs_state *fx) [fx] "D" (fx), "a" (lmask), "d" (hmask) : "memory"); asm volatile("2:\n\t" - xstate_fault + xstate_fault(err) : "0" (0) : "memory"); @@ -169,7 +175,7 @@ static inline int copy_kernel_to_xregs(struct xregs_state *fx, u64 mask) : "memory"); asm volatile("2:\n" - xstate_fault + xstate_fault(err) : "0" (0) : "memory"); @@ -201,7 +207,7 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf) __asm__ __volatile__(ASM_STAC "\n" "1:"XSAVE"\n" "2: " ASM_CLAC "\n" - xstate_fault + xstate_fault(err) : "D" (buf), "a" (-1), "d" (-1), "0" (0) : "memory"); return err; @@ -220,7 +226,7 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) __asm__ __volatile__(ASM_STAC "\n" "1:"XRSTOR"\n" "2: " ASM_CLAC "\n" - xstate_fault + xstate_fault(err) : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) : "memory"); /* memory required? */ return err; -- cgit v1.2.1 From 7cf82d33b613780a79fda91babf7b9e6c5c82d75 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 May 2015 11:35:02 +0200 Subject: x86/fpu/init: Move __setup() functions to fpu/init.c We had a number of FPU init related boot option handlers in arch/x86/kernel/cpu/common.c - move them over into arch/x86/kernel/fpu/init.c to have them all in a single place. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 36 ------------------------------------ arch/x86/kernel/fpu/init.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d6fe512441e5..401ccb03054e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -144,42 +144,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); -static int __init x86_xsave_setup(char *s) -{ - if (strlen(s)) - return 0; - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - return 1; -} -__setup("noxsave", x86_xsave_setup); - -static int __init x86_xsaveopt_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - return 1; -} -__setup("noxsaveopt", x86_xsaveopt_setup); - -static int __init x86_xsaves_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - return 1; -} -__setup("noxsaves", x86_xsaves_setup); - -static int __init x86_fxsr_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT); - setup_clear_cpu_cap(X86_FEATURE_XMM); - return 1; -} -__setup("nofxsr", x86_fxsr_setup); - #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index e9f1d6e62146..fe8cce7fc5ea 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -286,3 +286,40 @@ static int __init no_387(char *s) } __setup("no387", no_387); + +static int __init x86_xsave_setup(char *s) +{ + if (strlen(s)) + return 0; + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); + return 1; +} +__setup("noxsave", x86_xsave_setup); + +static int __init x86_xsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsaveopt", x86_xsaveopt_setup); + +static int __init x86_xsaves_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + return 1; +} +__setup("noxsaves", x86_xsaves_setup); + +static int __init x86_fxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT); + setup_clear_cpu_cap(X86_FEATURE_XMM); + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + -- cgit v1.2.1 From 5856afed0c8c419286d9f0c8e57e83e2875eec4b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 May 2015 11:39:35 +0200 Subject: x86/fpu/init: Clean up and comment the __setup() functions Explain the functions and also standardize their style and naming. No change in functionality. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index fe8cce7fc5ea..9da740e236d5 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -284,42 +284,57 @@ static int __init no_387(char *s) setup_clear_cpu_cap(X86_FEATURE_FPU); return 1; } - __setup("no387", no_387); -static int __init x86_xsave_setup(char *s) +/* + * Disable all xstate CPU features: + */ +static int __init x86_noxsave_setup(char *s) { if (strlen(s)) return 0; + setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); setup_clear_cpu_cap(X86_FEATURE_XSAVES); setup_clear_cpu_cap(X86_FEATURE_AVX); setup_clear_cpu_cap(X86_FEATURE_AVX2); + return 1; } -__setup("noxsave", x86_xsave_setup); +__setup("noxsave", x86_noxsave_setup); -static int __init x86_xsaveopt_setup(char *s) +/* + * Disable the XSAVEOPT instruction specifically: + */ +static int __init x86_noxsaveopt_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; } -__setup("noxsaveopt", x86_xsaveopt_setup); +__setup("noxsaveopt", x86_noxsaveopt_setup); -static int __init x86_xsaves_setup(char *s) +/* + * Disable the XSAVES instruction: + */ +static int __init x86_noxsaves_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVES); + return 1; } -__setup("noxsaves", x86_xsaves_setup); +__setup("noxsaves", x86_noxsaves_setup); -static int __init x86_fxsr_setup(char *s) +/* + * Disable FX save/restore and SSE support: + */ +static int __init x86_nofxsr_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_FXSR); setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT); setup_clear_cpu_cap(X86_FEATURE_XMM); + return 1; } -__setup("nofxsr", x86_fxsr_setup); - +__setup("nofxsr", x86_nofxsr_setup); -- cgit v1.2.1 From ed3cf15271fa150320b46f03d369777b71786298 Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Wed, 20 May 2015 00:24:10 -0400 Subject: kvm: x86: Make functions that have no external callers static This makes the functions kvm_guest_cpu_init and kvm_init_debugfs static now due to having no external callers outside their declarations in the file, kvm.c. Signed-off-by: Nicholas Krause Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9435620062df..cc34cece853e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -331,7 +331,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) apic_write(APIC_EOI, APIC_EOI_ACK); } -void kvm_guest_cpu_init(void) +static void kvm_guest_cpu_init(void) { if (!kvm_para_available()) return; @@ -655,7 +655,7 @@ static inline void spin_time_accum_blocked(u64 start) static struct dentry *d_spin_debug; static struct dentry *d_kvm_debug; -struct dentry *kvm_init_debugfs(void) +static struct dentry *kvm_init_debugfs(void) { d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); if (!d_kvm_debug) -- cgit v1.2.1 From b52f00e6a7154308a08d0a2edab621f277801a2c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 19 May 2015 16:59:04 -0700 Subject: x86: bpf_jit: implement bpf_tail_call() helper bpf_tail_call() arguments: ctx - context pointer jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table index - index in the jump table In this implementation x64 JIT bypasses stack unwind and jumps into the callee program after prologue, so the callee program reuses the same stack. The logic can be roughly expressed in C like: u32 tail_call_cnt; void *jumptable[2] = { &&label1, &&label2 }; int bpf_prog1(void *ctx) { label1: ... } int bpf_prog2(void *ctx) { label2: ... } int bpf_prog1(void *ctx) { ... if (tail_call_cnt++ < MAX_TAIL_CALL_CNT) goto *jumptable[index]; ... and pass my 'ctx' to callee ... ... fall through if no entry in jumptable ... } Note that 'skip current program epilogue and next program prologue' is an optimization. Other JITs don't have to do it the same way. >From safety point of view it's valid as well, since programs always initialize the stack before use, so any residue in the stack left by the current program is not going be read. The same verifier checks are done for the calls from the kernel into all bpf programs. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 150 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 126 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 99f76103c6b7..2ca777635d8e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include #include #include +#include int bpf_jit_enable __read_mostly; @@ -37,7 +38,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) return ptr + len; } -#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0) +#define EMIT(bytes, len) \ + do { prog = emit_code(prog, bytes, len); cnt += len; } while (0) #define EMIT1(b1) EMIT(b1, 1) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) @@ -186,31 +188,31 @@ struct jit_context { #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, - int oldproglen, struct jit_context *ctx) +#define STACKSIZE \ + (MAX_BPF_STACK + \ + 32 /* space for rbx, r13, r14, r15 */ + \ + 8 /* space for skb_copy_bits() buffer */) + +#define PROLOGUE_SIZE 51 + +/* emit x64 prologue code for BPF program and check it's size. + * bpf_tail_call helper will skip it while jumping into another program + */ +static void emit_prologue(u8 **pprog) { - struct bpf_insn *insn = bpf_prog->insnsi; - int insn_cnt = bpf_prog->len; - bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); - bool seen_exit = false; - u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i; - int proglen = 0; - u8 *prog = temp; - int stacksize = MAX_BPF_STACK + - 32 /* space for rbx, r13, r14, r15 */ + - 8 /* space for skb_copy_bits() buffer */; + u8 *prog = *pprog; + int cnt = 0; EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ - /* sub rsp, stacksize */ - EMIT3_off32(0x48, 0x81, 0xEC, stacksize); + /* sub rsp, STACKSIZE */ + EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE); /* all classic BPF filters use R6(rbx) save it */ /* mov qword ptr [rbp-X],rbx */ - EMIT3_off32(0x48, 0x89, 0x9D, -stacksize); + EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE); /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and @@ -221,16 +223,112 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, */ /* mov qword ptr [rbp-X],r13 */ - EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8); + EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8); /* mov qword ptr [rbp-X],r14 */ - EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16); + EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16); /* mov qword ptr [rbp-X],r15 */ - EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24); + EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24); /* clear A and X registers */ EMIT2(0x31, 0xc0); /* xor eax, eax */ EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */ + /* clear tail_cnt: mov qword ptr [rbp-X], rax */ + EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32); + + BUILD_BUG_ON(cnt != PROLOGUE_SIZE); + *pprog = prog; +} + +/* generate the following code: + * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... + * if (index >= array->map.max_entries) + * goto out; + * if (++tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + * prog = array->prog[index]; + * if (prog == NULL) + * goto out; + * goto *(prog->bpf_func + prologue_size); + * out: + */ +static void emit_bpf_tail_call(u8 **pprog) +{ + u8 *prog = *pprog; + int label1, label2, label3; + int cnt = 0; + + /* rdi - pointer to ctx + * rsi - pointer to bpf_array + * rdx - index in bpf_array + */ + + /* if (index >= array->map.max_entries) + * goto out; + */ + EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + offsetof(struct bpf_array, map.max_entries)); + EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ +#define OFFSET1 44 /* number of bytes to jump */ + EMIT2(X86_JBE, OFFSET1); /* jbe out */ + label1 = cnt; + + /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + */ + EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ + EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ +#define OFFSET2 33 + EMIT2(X86_JA, OFFSET2); /* ja out */ + label2 = cnt; + EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ + EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ + + /* prog = array->prog[index]; */ + EMIT4(0x48, 0x8D, 0x44, 0xD6); /* lea rax, [rsi + rdx * 8 + 0x50] */ + EMIT1(offsetof(struct bpf_array, prog)); + EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ + + /* if (prog == NULL) + * goto out; + */ + EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */ +#define OFFSET3 10 + EMIT2(X86_JE, OFFSET3); /* je out */ + label3 = cnt; + + /* goto *(prog->bpf_func + prologue_size); */ + EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */ + offsetof(struct bpf_prog, bpf_func)); + EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */ + + /* now we're ready to jump into next BPF program + * rdi == ctx (1st arg) + * rax == prog->bpf_func + prologue_size + */ + EMIT2(0xFF, 0xE0); /* jmp rax */ + + /* out: */ + BUILD_BUG_ON(cnt - label1 != OFFSET1); + BUILD_BUG_ON(cnt - label2 != OFFSET2); + BUILD_BUG_ON(cnt - label3 != OFFSET3); + *pprog = prog; +} + +static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, + int oldproglen, struct jit_context *ctx) +{ + struct bpf_insn *insn = bpf_prog->insnsi; + int insn_cnt = bpf_prog->len; + bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); + bool seen_exit = false; + u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; + int i, cnt = 0; + int proglen = 0; + u8 *prog = temp; + + emit_prologue(&prog); + if (seen_ld_abs) { /* r9d : skb->len - skb->data_len (headlen) * r10 : skb->data @@ -739,6 +837,10 @@ xadd: if (is_imm8(insn->off)) } break; + case BPF_JMP | BPF_CALL | BPF_X: + emit_bpf_tail_call(&prog); + break; + /* cond jump */ case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: @@ -891,13 +993,13 @@ common_load: /* update cleanup_addr */ ctx->cleanup_addr = proglen; /* mov rbx, qword ptr [rbp-X] */ - EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize); + EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE); /* mov r13, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8); + EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8); /* mov r14, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16); + EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16); /* mov r15, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24); + EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24); EMIT1(0xC9); /* leave */ EMIT1(0xC3); /* ret */ -- cgit v1.2.1 From b54b4bbbf5e9313acc681129ab332e33a397cd13 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 May 2015 10:58:45 +0200 Subject: x86/fpu, crypto: Fix AVX2 feature tests For some CPU models I broke the AVX2 feature detection in: 7bc371faa9cd ("x86/fpu, crypto x86/camellia_aesni_avx2: Simplify the camellia_aesni_init() xfeature checks") 534ff06e3929 ("x86/fpu, crypto x86/serpent_avx2: Simplify the init() xfeature checks") ... because I did not realize that it's possible for a CPU to support the xstate necessary for AVX2 execution (XSTATE_YMM), but not have the AVX2 instructions themselves. Restore the necessary CPUID checks as well. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/crypto/camellia_aesni_avx2_glue.c | 5 +++++ arch/x86/crypto/serpent_avx2_glue.c | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 76ea7df217e6..4c65c70e628b 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -562,6 +562,11 @@ static int __init camellia_aesni_init(void) { const char *feature_name; + if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + pr_info("AVX2 or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index f226ad41fde1..7d838dc4d888 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -538,6 +538,10 @@ static int __init init(void) { const char *feature_name; + if (!cpu_has_avx2 || !cpu_has_osxsave) { + pr_info("AVX2 instructions are not detected.\n"); + return -ENODEV; + } if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; -- cgit v1.2.1 From cdeb6048940fa4bfb429e2f1cba0d28a11e20cd5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 22 May 2015 16:15:47 -0700 Subject: x86/asm/irq: Stop relying on magic JMP behavior for early_idt_handlers The early_idt_handlers asm code generates an array of entry points spaced nine bytes apart. It's not really clear from that code or from the places that reference it what's going on, and the code only works in the first place because GAS never generates two-byte JMP instructions when jumping to global labels. Clean up the code to generate the correct array stride (member size) explicitly. This should be considerably more robust against screw-ups, as GAS will warn if a .fill directive has a negative count. Using '. =' to advance would have been even more robust (it would generate an actual error if it tried to move backwards), but it would pad with nulls, confusing anyone who tries to disassemble the code. The new scheme should be much clearer to future readers. While we're at it, improve the comments and rename the array and common code. Binutils may start relaxing jumps to non-weak labels. If so, this change will fix our build, and we may need to backport this change. Before, on x86_64: 0000000000000000 : 0: 6a 00 pushq $0x0 2: 6a 00 pushq $0x0 4: e9 00 00 00 00 jmpq 9 5: R_X86_64_PC32 early_idt_handler-0x4 ... 48: 66 90 xchg %ax,%ax 4a: 6a 08 pushq $0x8 4c: e9 00 00 00 00 jmpq 51 4d: R_X86_64_PC32 early_idt_handler-0x4 ... 117: 6a 00 pushq $0x0 119: 6a 1f pushq $0x1f 11b: e9 00 00 00 00 jmpq 120 11c: R_X86_64_PC32 early_idt_handler-0x4 After: 0000000000000000 : 0: 6a 00 pushq $0x0 2: 6a 00 pushq $0x0 4: e9 14 01 00 00 jmpq 11d ... 48: 6a 08 pushq $0x8 4a: e9 d1 00 00 00 jmpq 120 4f: cc int3 50: cc int3 ... 117: 6a 00 pushq $0x0 119: 6a 1f pushq $0x1f 11b: eb 03 jmp 120 11d: cc int3 11e: cc int3 11f: cc int3 Signed-off-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: Binutils Cc: Borislav Petkov Cc: H.J. Lu Cc: Jan Beulich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ac027962af343b0c599cbfcf50b945ad2ef3d7a8.1432336324.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/segment.h | 14 ++++++++++++-- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/head_32.S | 33 ++++++++++++++++++--------------- arch/x86/kernel/head_64.S | 20 +++++++++++--------- 4 files changed, 42 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5a9856eb12ba..7d5a1929d76b 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -231,11 +231,21 @@ #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) #ifdef __KERNEL__ + +/* + * early_idt_handler_array is an array of entry points referenced in the + * early IDT. For simplicity, it's a real array with one entry point + * every nine bytes. That leaves room for an optional 'push $0' if the + * vector has no error code (two bytes), a 'push $vector_number' (two + * bytes), and a jump to the common entry code (up to five bytes). + */ +#define EARLY_IDT_HANDLER_SIZE 9 + #ifndef __ASSEMBLY__ -extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; +extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; #ifdef CONFIG_TRACING -# define trace_early_idt_handlers early_idt_handlers +# define trace_early_idt_handler_array early_idt_handler_array #endif /* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2b55ee6db053..5a4668136e98 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handlers[i]); + set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 02d257256200..544dec4cc605 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -478,21 +478,22 @@ is486: __INIT setup_once: /* - * Set up a idt with 256 entries pointing to ignore_int, - * interrupt gates. It doesn't actually load idt - that needs - * to be done on each CPU. Interrupts are enabled elsewhere, - * when we can be relatively sure everything is ok. + * Set up a idt with 256 interrupt gates that push zero if there + * is no error code and then jump to early_idt_handler_common. + * It doesn't actually load the idt - that needs to be done on + * each CPU. Interrupts are enabled elsewhere, when we can be + * relatively sure everything is ok. */ movl $idt_table,%edi - movl $early_idt_handlers,%eax + movl $early_idt_handler_array,%eax movl $NUM_EXCEPTION_VECTORS,%ecx 1: movl %eax,(%edi) movl %eax,4(%edi) /* interrupt gate, dpl=0, present */ movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $9,%eax + addl $EARLY_IDT_HANDLER_SIZE,%eax addl $8,%edi loop 1b @@ -524,26 +525,28 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handlers) +ENTRY(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handlers) +ENDPROC(early_idt_handler_array) - /* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%esp) # X86_TRAP_NMI @@ -603,7 +606,7 @@ ex_entry: .Lis_nmi: addl $8,%esp /* drop vector number and error code */ iret -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ ALIGN diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 43eafc8afb69..e5c27f729a38 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -321,26 +321,28 @@ bad_address: jmp bad_address __INIT - .globl early_idt_handlers -early_idt_handlers: +ENTRY(early_idt_handler_array) # 104(%rsp) %rflags # 96(%rsp) %cs # 88(%rsp) %rip # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushq $0 # Dummy error code, to make stack frame uniform .endif pushq $i # 72(%rsp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr +ENDPROC(early_idt_handler_array) -/* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%rsp) # X86_TRAP_NMI @@ -412,7 +414,7 @@ ENTRY(early_idt_handler) .Lis_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) __INITDATA -- cgit v1.2.1 From 039ae58503f3349157a852c2cd5555a630f0bfaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 14 May 2015 13:16:37 +0200 Subject: hwmon: Allow to compile dell-smm-hwmon driver without /proc/i8k MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch splits CONFIG_I8K compile option to SENSORS_DELL_SMM and CONFIG_I8K. Option SENSORS_DELL_SMM is now used to enable compilation of dell-smm-hwmon driver and old CONFIG_I8K option to enable /proc/i8k interface in driver. So this change allows to compile dell-smm-hwmon driver without legacy /proc/i8k interface which is needed only for old Dell Inspirion models or for userspace i8kutils package. For backward compatibility when CONFIG_I8K is enabled then also SENSORS_DELL_SMM is enabled and so driver dell-smm-hwmon (with /proc/i8k) is compiled. Signed-off-by: Pali Rohár Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 226d5696e1d1..7b756b337926 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1055,24 +1055,19 @@ config TOSHIBA Say N otherwise. config I8K - tristate "Dell laptop support" + tristate "Dell i8k legacy laptop support" select HWMON + select SENSORS_DELL_SMM ---help--- - This adds a driver to safely access the System Management Mode - of the CPU on the Dell Inspiron 8000. The System Management Mode - is used to read cpu temperature and cooling fan status and to - control the fans on the I8K portables. + This option enables legacy /proc/i8k userspace interface in hwmon + dell-smm-hwmon driver. Character file /proc/i8k reports bios version, + temperature and allows controlling fan speeds of Dell laptops via + System Management Mode. For old Dell laptops (like Dell Inspiron 8000) + it reports also power and hotkey status. For fan speed control is + needed userspace package i8kutils. - This driver has been tested only on the Inspiron 8000 but it may - also work with other Dell laptops. You can force loading on other - models by passing the parameter `force=1' to the module. Use at - your own risk. - - For information on utilities to make use of this driver see the - I8K Linux utilities web site at: - - - Say Y if you intend to run this kernel on a Dell Inspiron 8000. + Say Y if you intend to run this kernel on old Dell laptops or want to + use userspace package i8kutils. Say N otherwise. config X86_REBOOTFIXUPS -- cgit v1.2.1 From b8c1b8ea7b219a7ba6d58d97bfdf1403b741f8d5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 24 May 2015 09:58:12 +0200 Subject: x86/fpu: Fix FPU state save area alignment bug On most configs task-struct is cache line aligned, which makes the XSAVE area's 64-byte required alignment work out fine. But on some .config's task_struct is aligned only to 16 bytes (enforced by ARCH_MIN_TASKALIGN), which makes things like fpu__copy() (that XSAVEOPT uses) not work so well. I broke this in: 7366ed771f6e ("x86/fpu: Simplify FPU handling by embedding the fpstate in task_struct (again)") which embedded the fpstate in the task_struct. The alignment requirements of the FPU code were originally present in ARCH_MIN_TASKALIGN, which still has a value of 16, which was the alignment requirement of the FPU state area prior XSAVE. But this link was not documented (and not required) and the link got lost when the FPU state area was made dynamic years ago. With XSAVEOPT the minimum alignment requirment went up to 64 bytes, and the embedding of the FPU state area in task_struct exposed it again - and '16' was not increased to '64'. So fix this bug, but also try to address the underlying lost link of information that made it easier to happen: - document ARCH_MIN_TASKALIGN a bit better - use alignof() to recover the current alignment requirements. This would work in the future as well, should the alignment requirements go up to 128 bytes with things like AVX512. ( We should probably also use the vSMP alignment rules for all of x86, but that's for another patch. ) Reported-by: Peter Zijlstra Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b9e487499ae2..8e04f51d6bea 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -53,11 +53,16 @@ static inline void *current_text_addr(void) return pc; } +/* + * These alignment constraints are for performance in the vSMP case, + * but in the task_struct case we must also meet hardware imposed + * alignment requirements of the FPU state: + */ #ifdef CONFIG_X86_VSMP # define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) # define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) #else -# define ARCH_MIN_TASKALIGN 16 +# define ARCH_MIN_TASKALIGN __alignof__(union fpregs_state) # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -- cgit v1.2.1 From fd169b05415485bf47606d820194c39cb2f53201 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2015 09:55:39 +0200 Subject: x86/fpu: Move the xstate copying functions into fpu/internal.h All the other register<-> memory copying functions are defined in fpu/internal.h, so move the xstate variants there too. Beyond being more consistent, this also allows FPU debugging checks to be added to them. (Because they can now use the macros defined in fpu/internal.h.) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 192 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/fpu/xstate.h | 192 ------------------------------------ 2 files changed, 192 insertions(+), 192 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a98a08d1efa9..bcc4173a888a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -208,6 +208,198 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) } } +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* xstate instruction fault handler: */ +#define xstate_fault(__err) \ + \ + ".section .fixup,\"ax\"\n" \ + \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + \ + ".previous\n" \ + \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (__err) + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline int copy_xregs_to_kernel_booting(struct xregs_state *fx) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XSAVES"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + else + asm volatile("1:"XSAVE"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + return err; +} + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline int copy_kernel_to_xregs_booting(struct xregs_state *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XRSTORS"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + else + asm volatile("1:"XRSTOR"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + return err; +} + +/* + * Save processor xstate to xsave area. + */ +static inline int copy_xregs_to_kernel(struct xregs_state *fx) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(!alternatives_patched); + + /* + * If xsaves is enabled, xsaves replaces xsaveopt because + * it supports compact format and supervisor states in addition to + * modified optimization in xsaveopt. + * + * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave + * because xsaveopt supports modified optimization which is not + * supported by xsave. + * + * If none of xsaves and xsaveopt is enabled, use xsave. + */ + alternative_input_2( + "1:"XSAVE, + XSAVEOPT, + X86_FEATURE_XSAVEOPT, + XSAVES, + X86_FEATURE_XSAVES, + [fx] "D" (fx), "a" (lmask), "d" (hmask) : + "memory"); + asm volatile("2:\n\t" + xstate_fault(err) + : "0" (0) + : "memory"); + + return err; +} + +/* + * Restore processor xstate from xsave area. + */ +static inline int copy_kernel_to_xregs(struct xregs_state *fx, u64 mask) +{ + int err = 0; + u32 lmask = mask; + u32 hmask = mask >> 32; + + /* + * Use xrstors to restore context if it is enabled. xrstors supports + * compacted format of xsave area which is not supported by xrstor. + */ + alternative_input( + "1: " XRSTOR, + XRSTORS, + X86_FEATURE_XSAVES, + "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile("2:\n" + xstate_fault(err) + : "0" (0) + : "memory"); + + return err; +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for + * backward compatibility for old applications which don't understand + * compacted format of xsave area. + */ +static inline int copy_xregs_to_user(struct xregs_state __user *buf) +{ + int err; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + err = __clear_user(&buf->header, sizeof(buf->header)); + if (unlikely(err)) + return -EFAULT; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XSAVE"\n" + "2: " ASM_CLAC "\n" + xstate_fault(err) + : "D" (buf), "a" (-1), "d" (-1), "0" (0) + : "memory"); + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) +{ + int err = 0; + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XRSTOR"\n" + "2: " ASM_CLAC "\n" + xstate_fault(err) + : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) + : "memory"); /* memory required? */ + return err; +} + /* * These must be called with preempt disabled. Returns * 'true' if the FPU state is still intact and we can diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index a6e07bb31526..339894669117 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -40,198 +40,6 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" - -/* xstate instruction fault handler: */ -#define xstate_fault(__err) \ - \ - ".section .fixup,\"ax\"\n" \ - \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - \ - ".previous\n" \ - \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (__err) - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int copy_xregs_to_kernel_booting(struct xregs_state *fx) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XSAVES"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XSAVE"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int copy_kernel_to_xregs_booting(struct xregs_state *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XRSTORS"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XRSTOR"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * Save processor xstate to xsave area. - */ -static inline int copy_xregs_to_kernel(struct xregs_state *fx) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(!alternatives_patched); - - /* - * If xsaves is enabled, xsaves replaces xsaveopt because - * it supports compact format and supervisor states in addition to - * modified optimization in xsaveopt. - * - * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave - * because xsaveopt supports modified optimization which is not - * supported by xsave. - * - * If none of xsaves and xsaveopt is enabled, use xsave. - */ - alternative_input_2( - "1:"XSAVE, - XSAVEOPT, - X86_FEATURE_XSAVEOPT, - XSAVES, - X86_FEATURE_XSAVES, - [fx] "D" (fx), "a" (lmask), "d" (hmask) : - "memory"); - asm volatile("2:\n\t" - xstate_fault(err) - : "0" (0) - : "memory"); - - return err; -} - -/* - * Restore processor xstate from xsave area. - */ -static inline int copy_kernel_to_xregs(struct xregs_state *fx, u64 mask) -{ - int err = 0; - u32 lmask = mask; - u32 hmask = mask >> 32; - - /* - * Use xrstors to restore context if it is enabled. xrstors supports - * compacted format of xsave area which is not supported by xrstor. - */ - alternative_input( - "1: " XRSTOR, - XRSTORS, - X86_FEATURE_XSAVES, - "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - - asm volatile("2:\n" - xstate_fault(err) - : "0" (0) - : "memory"); - - return err; -} - -/* - * Save xstate to user space xsave area. - * - * We don't use modified optimization because xrstor/xrstors might track - * a different application. - * - * We don't use compacted format xsave area for - * backward compatibility for old applications which don't understand - * compacted format of xsave area. - */ -static inline int copy_xregs_to_user(struct xregs_state __user *buf) -{ - int err; - - /* - * Clear the xsave header first, so that reserved fields are - * initialized to zero. - */ - err = __clear_user(&buf->header, sizeof(buf->header)); - if (unlikely(err)) - return -EFAULT; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XSAVE"\n" - "2: " ASM_CLAC "\n" - xstate_fault(err) - : "D" (buf), "a" (-1), "d" (-1), "0" (0) - : "memory"); - return err; -} - -/* - * Restore xstate from user space xsave area. - */ -static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) -{ - int err = 0; - struct xregs_state *xstate = ((__force struct xregs_state *)buf); - u32 lmask = mask; - u32 hmask = mask >> 32; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XRSTOR"\n" - "2: " ASM_CLAC "\n" - xstate_fault(err) - : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) - : "memory"); /* memory required? */ - return err; -} - void *get_xsave_addr(struct xregs_state *xsave, int xstate); #endif -- cgit v1.2.1 From 6e5535940fcc2608e58af997236ce5e0391ccfc6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 24 May 2015 09:58:12 +0200 Subject: x86/fpu: Fix fpu__init_system_xstate() comments Remove obsolete comment about __init limitations: in the new code there aren't any. Also standardize the comment style in the function while at it. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 5724098adf1b..ad4dd26ebd59 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -316,8 +316,6 @@ static void __init init_xstate_size(void) /* * Enable and initialize the xsave feature. * Called once per system bootup. - * - * ( Not marked __init because of false positive section warnings. ) */ void __init fpu__init_system_xstate(void) { @@ -345,17 +343,13 @@ void __init fpu__init_system_xstate(void) BUG(); } - /* - * Support only the state known to OS. - */ + /* Support only the state known to the OS: */ xfeatures_mask = xfeatures_mask & XCNTXT_MASK; /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); - /* - * Recompute the context size for enabled features - */ + /* Recompute the context size for enabled features: */ init_xstate_size(); update_regset_xstate_info(xstate_size, xfeatures_mask); -- cgit v1.2.1 From 87dafd41a4423c6f730e5f4b0d56a1aa3fdcf3fc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2015 10:57:06 +0200 Subject: x86/fpu: Rename xstate related 'fx' references to 'xstate' So the xstate code was probably first copied from the fxregs code, hence it carried over the 'fx' naming for the state pointer variable. But this is slightly confusing, as we usually on call the (legacy) MMX/SSE state 'fx', both in data structures and in the functions build around FXSAVE/FXRSTOR. So rename it to 'xstate' to make it more apparent what it is related to. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index bcc4173a888a..d142ecb067b8 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -232,7 +232,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline int copy_xregs_to_kernel_booting(struct xregs_state *fx) +static inline int copy_xregs_to_kernel_booting(struct xregs_state *xstate) { u64 mask = -1; u32 lmask = mask; @@ -245,13 +245,13 @@ static inline int copy_xregs_to_kernel_booting(struct xregs_state *fx) asm volatile("1:"XSAVES"\n\t" "2:\n\t" xstate_fault(err) - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) : "memory"); else asm volatile("1:"XSAVE"\n\t" "2:\n\t" xstate_fault(err) - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) : "memory"); return err; } @@ -260,7 +260,7 @@ static inline int copy_xregs_to_kernel_booting(struct xregs_state *fx) * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline int copy_kernel_to_xregs_booting(struct xregs_state *fx, u64 mask) +static inline int copy_kernel_to_xregs_booting(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; @@ -272,13 +272,13 @@ static inline int copy_kernel_to_xregs_booting(struct xregs_state *fx, u64 mask) asm volatile("1:"XRSTORS"\n\t" "2:\n\t" xstate_fault(err) - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) : "memory"); else asm volatile("1:"XRSTOR"\n\t" "2:\n\t" xstate_fault(err) - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) : "memory"); return err; } @@ -286,7 +286,7 @@ static inline int copy_kernel_to_xregs_booting(struct xregs_state *fx, u64 mask) /* * Save processor xstate to xsave area. */ -static inline int copy_xregs_to_kernel(struct xregs_state *fx) +static inline int copy_xregs_to_kernel(struct xregs_state *xstate) { u64 mask = -1; u32 lmask = mask; @@ -312,7 +312,7 @@ static inline int copy_xregs_to_kernel(struct xregs_state *fx) X86_FEATURE_XSAVEOPT, XSAVES, X86_FEATURE_XSAVES, - [fx] "D" (fx), "a" (lmask), "d" (hmask) : + [xstate] "D" (xstate), "a" (lmask), "d" (hmask) : "memory"); asm volatile("2:\n\t" xstate_fault(err) @@ -325,7 +325,7 @@ static inline int copy_xregs_to_kernel(struct xregs_state *fx) /* * Restore processor xstate from xsave area. */ -static inline int copy_kernel_to_xregs(struct xregs_state *fx, u64 mask) +static inline int copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) { int err = 0; u32 lmask = mask; @@ -339,7 +339,7 @@ static inline int copy_kernel_to_xregs(struct xregs_state *fx, u64 mask) "1: " XRSTOR, XRSTORS, X86_FEATURE_XSAVES, - "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) : "memory"); asm volatile("2:\n" @@ -426,7 +426,7 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to mark them inactive: */ - asm volatile("fnsave %[fx]; fwait" : [fx] "=m" (fpu->state.fsave)); + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); return 0; } @@ -459,7 +459,7 @@ static inline int copy_fpstate_to_fpregs(struct fpu *fpu) return __copy_fpstate_to_fpregs(fpu); } -extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fx, int size); +extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); /* * FPU context switch related helper methods: -- cgit v1.2.1 From 87b6559d0a37cd82b4b2ffe38f88c0d4ac6ee7e2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2015 11:27:46 +0200 Subject: x86/fpu: Improve xstate_fault() handling There are two problems with xstate_fault handling: - The xstate_fault() macro takes an argument, but that's propagated into the assembly named label as well. This is technically correct currently but might result in failures if anytime a more complex argument is used. So use a separate '_err' name instead for the label. - All the xstate_fault() using functions have an error variable named 'err', which is an output variable to the asm() they are using. The problem is, it's not always set by the asm(), in which case the compiler might optimize out its initialization, so that the C variable 'err' might become corrupted after the asm() - confusing anyone who tries to take advantage of this variable after the asm(). Mark it an input variable as well. This is a latent bug currently, but an upcoming debug patch will make use of 'err'. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d142ecb067b8..5370500d479e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -220,13 +220,13 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) \ ".section .fixup,\"ax\"\n" \ \ - "3: movl $-1,%[err]\n" \ + "3: movl $-2,%[_err]\n" \ " jmp 2b\n" \ \ ".previous\n" \ \ _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (__err) + : [_err] "=r" (__err) /* * This function is called only during boot time when x86 caps are not set @@ -245,14 +245,14 @@ static inline int copy_xregs_to_kernel_booting(struct xregs_state *xstate) asm volatile("1:"XSAVES"\n\t" "2:\n\t" xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) - : "memory"); + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); else asm volatile("1:"XSAVE"\n\t" "2:\n\t" xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) - : "memory"); + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); return err; } @@ -272,14 +272,14 @@ static inline int copy_kernel_to_xregs_booting(struct xregs_state *xstate, u64 m asm volatile("1:"XRSTORS"\n\t" "2:\n\t" xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) - : "memory"); + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); else asm volatile("1:"XRSTOR"\n\t" "2:\n\t" xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) - : "memory"); + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); return err; } -- cgit v1.2.1 From 685c9616248c4f0d57e0d81d3236c80bdce1af46 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2015 11:59:35 +0200 Subject: x86/fpu: Improve the initialization logic of 'err' around xstate_fault() constraints There's a confusing aspect of how xstate_fault() constraints are handled by the FPU register/memory copying functions in fpu/internal.h: they use "0" (0) to signal that the asm code will not always set 'err' to a valid value. But 'err' is already initialized to 0 in C code, which is duplicated by the asm() constraint. Should the initialization value ever be changed, it might become subtly inconsistent with the not too clear asm() constraint. Use 'err' as the value of the input variable instead, to clarify this all. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 5370500d479e..1352d380bd46 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -316,7 +316,7 @@ static inline int copy_xregs_to_kernel(struct xregs_state *xstate) "memory"); asm volatile("2:\n\t" xstate_fault(err) - : "0" (0) + : "0" (err) : "memory"); return err; @@ -327,9 +327,9 @@ static inline int copy_xregs_to_kernel(struct xregs_state *xstate) */ static inline int copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) { - int err = 0; u32 lmask = mask; u32 hmask = mask >> 32; + int err = 0; /* * Use xrstors to restore context if it is enabled. xrstors supports @@ -344,7 +344,7 @@ static inline int copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) asm volatile("2:\n" xstate_fault(err) - : "0" (0) + : "0" (err) : "memory"); return err; @@ -376,7 +376,7 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf) "1:"XSAVE"\n" "2: " ASM_CLAC "\n" xstate_fault(err) - : "D" (buf), "a" (-1), "d" (-1), "0" (0) + : "D" (buf), "a" (-1), "d" (-1), "0" (err) : "memory"); return err; } @@ -386,16 +386,16 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf) */ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) { - int err = 0; struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; u32 hmask = mask >> 32; + int err = 0; __asm__ __volatile__(ASM_STAC "\n" "1:"XRSTOR"\n" "2: " ASM_CLAC "\n" xstate_fault(err) - : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) + : "D" (xstate), "a" (lmask), "d" (hmask), "0" (err) : "memory"); /* memory required? */ return err; } -- cgit v1.2.1 From 8c05f05edb7795ecd1fa95d5d44bc5b22fd85287 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 24 May 2015 09:23:25 +0200 Subject: x86/fpu: Micro-optimize the copy_xregs_to_kernel*() and copy_kernel_to_xregs*() functions The copy_xregs_to_kernel*() and copy_kernel_to_xregs*() functions are used to copy FPU registers to kernel memory and vice versa. They are never expected to fail, yet they have a return code, mostly because that way they can share the assembly macros with the copy*user*() functions. This error code is then silently ignored by the context switching and other code - which made the bug in: b8c1b8ea7b21 ("x86/fpu: Fix FPU state save area alignment bug") harder to fix than necessary. So remove the return values and check for no faults when FPU debugging is enabled in the .config. This improves the eagerfpu context switching fast path by a couple of instructions, when FPU debugging is disabled: ffffffff810407fa: 89 c2 mov %eax,%edx ffffffff810407fc: 48 0f ae 2f xrstor64 (%rdi) ffffffff81040800: 31 c0 xor %eax,%eax -ffffffff81040802: eb 0a jmp ffffffff8104080e <__switch_to+0x321> +ffffffff81040802: eb 16 jmp ffffffff8104081a <__switch_to+0x32d> ffffffff81040804: 31 c0 xor %eax,%eax ffffffff81040806: 48 0f ae 8b c0 05 00 fxrstor64 0x5c0(%rbx) ffffffff8104080d: 00 Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 1352d380bd46..99690bed920a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -232,7 +232,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline int copy_xregs_to_kernel_booting(struct xregs_state *xstate) +static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) { u64 mask = -1; u32 lmask = mask; @@ -253,14 +253,16 @@ static inline int copy_xregs_to_kernel_booting(struct xregs_state *xstate) xstate_fault(err) : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) : "memory"); - return err; + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); } /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline int copy_kernel_to_xregs_booting(struct xregs_state *xstate, u64 mask) +static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; @@ -280,13 +282,15 @@ static inline int copy_kernel_to_xregs_booting(struct xregs_state *xstate, u64 m xstate_fault(err) : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) : "memory"); - return err; + + /* We should never fault when copying from a kernel buffer: */ + WARN_ON_FPU(err); } /* * Save processor xstate to xsave area. */ -static inline int copy_xregs_to_kernel(struct xregs_state *xstate) +static inline void copy_xregs_to_kernel(struct xregs_state *xstate) { u64 mask = -1; u32 lmask = mask; @@ -319,13 +323,14 @@ static inline int copy_xregs_to_kernel(struct xregs_state *xstate) : "0" (err) : "memory"); - return err; + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); } /* * Restore processor xstate from xsave area. */ -static inline int copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) +static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; @@ -347,7 +352,8 @@ static inline int copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) : "0" (err) : "memory"); - return err; + /* We should never fault when copying from a kernel buffer: */ + WARN_ON_FPU(err); } /* @@ -433,12 +439,15 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) static inline int __copy_fpstate_to_fpregs(struct fpu *fpu) { - if (use_xsave()) - return copy_kernel_to_xregs(&fpu->state.xsave, -1); - else if (use_fxsr()) - return copy_kernel_to_fxregs(&fpu->state.fxsave); - else - return copy_kernel_to_fregs(&fpu->state.fsave); + if (use_xsave()) { + copy_kernel_to_xregs(&fpu->state.xsave, -1); + return 0; + } else { + if (use_fxsr()) + return copy_kernel_to_fxregs(&fpu->state.fxsave); + else + return copy_kernel_to_fregs(&fpu->state.fsave); + } } static inline int copy_fpstate_to_fpregs(struct fpu *fpu) -- cgit v1.2.1 From 9f6b8029787bb37170d4535e9fc09158f634282c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 17 May 2015 16:20:07 +0200 Subject: KVM: use kvm_memslots whenever possible kvm_memslots provides lockdep checking. Use it consistently instead of explicit dereferencing of kvm->memslots. Reviewed-by: Radim Krcmar Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d42d8ace90f1..8918e23e0e8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7782,6 +7782,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { + struct kvm_memslots *slots; struct kvm_memory_slot *new; int nr_mmu_pages = 0; @@ -7803,7 +7804,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); /* It's OK to get 'new' slot here as it has already been installed */ - new = id_to_memslot(kvm->memslots, mem->slot); + slots = kvm_memslots(kvm); + new = id_to_memslot(slots, mem->slot); /* * Dirty logging tracks sptes in 4k granularity, meaning that large -- cgit v1.2.1 From 09170a49422bd786be3eac5cec1955257c5a34b7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 May 2015 13:59:39 +0200 Subject: KVM: const-ify uses of struct kvm_userspace_memory_region Architecture-specific helpers are not supposed to muck with struct kvm_userspace_memory_region contents. Add const to enforce this. In order to eliminate the only write in __kvm_set_memory_region, the cleaning of deleted slots is pulled up from update_memslots to __kvm_set_memory_region. Reviewed-by: Takuya Yoshikawa Reviewed-by: Radim Krcmar Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8918e23e0e8e..30854ea218e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7700,7 +7700,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm) int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { /* @@ -7778,7 +7778,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, enum kvm_mr_change change) { -- cgit v1.2.1 From 15f46015ee17681b542432df21747f5c51857156 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 17 May 2015 21:26:08 +0200 Subject: KVM: add memslots argument to kvm_arch_memslots_updated Prepare for the case of multiple address spaces. Reviewed-by: Radim Krcmar Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 30854ea218e7..f0aec85f8cdd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7689,7 +7689,7 @@ out_free: return -ENOMEM; } -void kvm_arch_memslots_updated(struct kvm *kvm) +void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) { /* * memslots->generation has been incremented. -- cgit v1.2.1 From b371b594317869971af326adcf7cd65cabdb4087 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 10:57:13 +0200 Subject: perf/x86: Fix event/group validation Commit 43b4578071c0 ("perf/x86: Reduce stack usage of x86_schedule_events()") violated the rule that 'fake' scheduling; as used for event/group validation; should not change the event state. This went mostly un-noticed because repeated calls of x86_pmu::get_event_constraints() would give the same result. And x86_pmu::put_event_constraints() would mostly not do anything. Commit e979121b1b15 ("perf/x86/intel: Implement cross-HT corruption bug workaround") made the situation much worse by actually setting the event->hw.constraint value to NULL, so when validation and actual scheduling interact we get NULL ptr derefs. Fix it by removing the constraint pointer from the event and move it back to an array, this time in cpuc instead of on the stack. validate_group() x86_schedule_events() event->hw.constraint = c; # store perf_task_event_sched_in() ... x86_schedule_events(); event->hw.constraint = c2; # store ... put_event_constraints(event); # assume failure to schedule intel_put_event_constraints() event->hw.constraint = NULL; c = event->hw.constraint; # read -> NULL if (!test_bit(hwc->idx, c->idxmsk)) # <- *BOOM* NULL deref This in particular is possible when the event in question is a cpu-wide event and group-leader, where the validate_group() tries to add an event to the group. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Hunter Cc: Linus Torvalds Cc: Maria Dimakopoulou Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 43b4578071c0 ("perf/x86: Reduce stack usage of x86_schedule_events()") Fixes: e979121b1b15 ("perf/x86/intel: Implement cross-HT corruption bug workaround") Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 33 +++++++++++++++------------ arch/x86/kernel/cpu/perf_event.h | 9 ++++---- arch/x86/kernel/cpu/perf_event_intel.c | 15 ++++-------- arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 ++-- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 7 +++--- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 1 + 6 files changed, 33 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 87848ebe2bb7..1664eeea65e0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -620,7 +620,7 @@ struct sched_state { struct perf_sched { int max_weight; int max_events; - struct perf_event **events; + struct event_constraint **constraints; struct sched_state state; int saved_states; struct sched_state saved[SCHED_STATES_MAX]; @@ -629,7 +629,7 @@ struct perf_sched { /* * Initialize interator that runs through all events and counters. */ -static void perf_sched_init(struct perf_sched *sched, struct perf_event **events, +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax) { int idx; @@ -637,10 +637,10 @@ static void perf_sched_init(struct perf_sched *sched, struct perf_event **events memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; - sched->events = events; + sched->constraints = constraints; for (idx = 0; idx < num; idx++) { - if (events[idx]->hw.constraint->weight == wmin) + if (constraints[idx]->weight == wmin) break; } @@ -687,7 +687,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) if (sched->state.event >= sched->max_events) return false; - c = sched->events[sched->state.event]->hw.constraint; + c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; @@ -745,7 +745,7 @@ static bool perf_sched_next_event(struct perf_sched *sched) if (sched->state.weight > sched->max_weight) return false; } - c = sched->events[sched->state.event]->hw.constraint; + c = sched->constraints[sched->state.event]; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ @@ -756,12 +756,12 @@ static bool perf_sched_next_event(struct perf_sched *sched) /* * Assign a counter for each event. */ -int perf_assign_events(struct perf_event **events, int n, +int perf_assign_events(struct event_constraint **constraints, int n, int wmin, int wmax, int *assign) { struct perf_sched sched; - perf_sched_init(&sched, events, n, wmin, wmax); + perf_sched_init(&sched, constraints, n, wmin, wmax); do { if (!perf_sched_find_counter(&sched)) @@ -788,9 +788,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) x86_pmu.start_scheduling(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { - hwc = &cpuc->event_list[i]->hw; + cpuc->event_constraint[i] = NULL; c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); - hwc->constraint = c; + cpuc->event_constraint[i] = c; wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); @@ -801,7 +801,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) */ for (i = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; - c = hwc->constraint; + c = cpuc->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) @@ -821,9 +821,10 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) } /* slow path */ - if (i != n) - unsched = perf_assign_events(cpuc->event_list, n, wmin, + if (i != n) { + unsched = perf_assign_events(cpuc->event_constraint, n, wmin, wmax, assign); + } /* * In case of success (unsched = 0), mark events as committed, @@ -840,7 +841,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) e = cpuc->event_list[i]; e->hw.flags |= PERF_X86_EVENT_COMMITTED; if (x86_pmu.commit_scheduling) - x86_pmu.commit_scheduling(cpuc, e, assign[i]); + x86_pmu.commit_scheduling(cpuc, i, assign[i]); } } @@ -1292,8 +1293,10 @@ static void x86_pmu_del(struct perf_event *event, int flags) x86_pmu.put_event_constraints(cpuc, event); /* Delete the array entry. */ - while (++i < cpuc->n_events) + while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; + cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; + } --cpuc->n_events; perf_event_update_userpage(event); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6ac5cb7a9e14..fdfaab7c5e55 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -172,7 +172,10 @@ struct cpu_hw_events { added in the current transaction */ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; + unsigned int group_flag; int is_fake; @@ -519,9 +522,7 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); - void (*commit_scheduling)(struct cpu_hw_events *cpuc, - struct perf_event *event, - int cntr); + void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); void (*start_scheduling)(struct cpu_hw_events *cpuc); @@ -717,7 +718,7 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); -int perf_assign_events(struct perf_event **events, int n, +int perf_assign_events(struct event_constraint **constraints, int n, int wmin, int wmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3998131d1a68..7a58fb5df15c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2106,7 +2106,7 @@ static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { - struct event_constraint *c1 = event->hw.constraint; + struct event_constraint *c1 = cpuc->event_constraint[idx]; struct event_constraint *c2; /* @@ -2188,8 +2188,6 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, static void intel_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - struct event_constraint *c = event->hw.constraint; - intel_put_shared_regs_event_constraints(cpuc, event); /* @@ -2197,19 +2195,14 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, * all events are subject to and must call the * put_excl_constraints() routine */ - if (c && cpuc->excl_cntrs) + if (cpuc->excl_cntrs) intel_put_excl_constraints(cpuc, event); - - /* cleanup dynamic constraint */ - if (c && (c->flags & PERF_X86_EVENT_DYNAMIC)) - event->hw.constraint = NULL; } -static void intel_commit_scheduling(struct cpu_hw_events *cpuc, - struct perf_event *event, int cntr) +static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) { struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct event_constraint *c = event->hw.constraint; + struct event_constraint *c = cpuc->event_constraint[idx]; struct intel_excl_states *xlo, *xl; int tid = cpuc->excl_thread_id; int o_tid = 1 - tid; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 813f75d71175..7f73b3553e2e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -706,9 +706,9 @@ void intel_pmu_pebs_disable(struct perf_event *event) cpuc->pebs_enabled &= ~(1ULL << hwc->idx); - if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT) + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); - else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST) + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); if (cpuc->enabled) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c635b8b49e93..ec2ba578d286 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -365,9 +365,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { - hwc = &box->event_list[i]->hw; c = uncore_get_event_constraint(box, box->event_list[i]); - hwc->constraint = c; + box->event_constraint[i] = c; wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } @@ -375,7 +374,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int /* fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { hwc = &box->event_list[i]->hw; - c = hwc->constraint; + c = box->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) @@ -395,7 +394,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int } /* slow path */ if (i != n) - ret = perf_assign_events(box->event_list, n, + ret = perf_assign_events(box->event_constraint, n, wmin, wmax, assign); if (!assign || ret) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 6c8c1e7e69d8..f789ec9a0133 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -97,6 +97,7 @@ struct intel_uncore_box { atomic_t refcnt; struct perf_event *events[UNCORE_PMC_IDX_MAX]; struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX]; unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; u64 tags[UNCORE_PMC_IDX_MAX]; struct pci_dev *pci_dev; -- cgit v1.2.1 From cc1790cf541553263bda024295d7600c7cd7c45d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 10:57:17 +0200 Subject: perf/x86: Improve HT workaround GP counter constraint The (SNB/IVB/HSW) HT bug only affects events that can be programmed onto GP counters, therefore we should only limit the number of GP counters that can be used per cpu -- iow we should not constrain the FP counters. Furthermore, we should only enfore such a limit when there are in fact exclusive events being scheduled on either sibling. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner [ Fixed build fail for the !CONFIG_CPU_SUP_INTEL case. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 36 ++++++++++++++++++++++----- arch/x86/kernel/cpu/perf_event.h | 15 ++++++++--- arch/x86/kernel/cpu/perf_event_intel.c | 30 ++++++++-------------- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- 4 files changed, 53 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1664eeea65e0..2eca19422454 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -611,6 +611,7 @@ struct sched_state { int event; /* event index */ int counter; /* counter index */ int unassigned; /* number of events to be assigned left */ + int nr_gp; /* number of GP counters used */ unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; @@ -620,9 +621,10 @@ struct sched_state { struct perf_sched { int max_weight; int max_events; + int max_gp; + int saved_states; struct event_constraint **constraints; struct sched_state state; - int saved_states; struct sched_state saved[SCHED_STATES_MAX]; }; @@ -630,13 +632,14 @@ struct perf_sched { * Initialize interator that runs through all events and counters. */ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, - int num, int wmin, int wmax) + int num, int wmin, int wmax, int gpmax) { int idx; memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; + sched->max_gp = gpmax; sched->constraints = constraints; for (idx = 0; idx < num; idx++) { @@ -696,11 +699,16 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) goto done; } } + /* Grab the first unused counter starting with idx */ idx = sched->state.counter; for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { - if (!__test_and_set_bit(idx, sched->state.used)) + if (!__test_and_set_bit(idx, sched->state.used)) { + if (sched->state.nr_gp++ >= sched->max_gp) + return false; + goto done; + } } return false; @@ -757,11 +765,11 @@ static bool perf_sched_next_event(struct perf_sched *sched) * Assign a counter for each event. */ int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int *assign) + int wmin, int wmax, int gpmax, int *assign) { struct perf_sched sched; - perf_sched_init(&sched, constraints, n, wmin, wmax); + perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); do { if (!perf_sched_find_counter(&sched)) @@ -822,8 +830,24 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) { + int gpmax = x86_pmu.num_counters; + + /* + * Do not allow scheduling of more than half the available + * generic counters. + * + * This helps avoid counter starvation of sibling thread by + * ensuring at most half the counters cannot be in exclusive + * mode. There is no designated counters for the limits. Any + * N/2 counters can be used. This helps with events with + * specific counter constraints. + */ + if (is_ht_workaround_enabled() && !cpuc->is_fake && + READ_ONCE(cpuc->excl_cntrs->exclusive_present)) + gpmax /= 2; + unsched = perf_assign_events(cpuc->event_constraint, n, wmin, - wmax, assign); + wmax, gpmax, assign); } /* diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index fdfaab7c5e55..ef78516850fb 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -74,6 +74,7 @@ struct event_constraint { #define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ #define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ +#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ struct amd_nb { @@ -134,8 +135,6 @@ enum intel_excl_state_type { struct intel_excl_states { enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; enum intel_excl_state_type state[X86_PMC_IDX_MAX]; - int num_alloc_cntrs;/* #counters allocated */ - int max_alloc_cntrs;/* max #counters allowed */ bool sched_started; /* true if scheduling has started */ }; @@ -144,6 +143,11 @@ struct intel_excl_cntrs { struct intel_excl_states states[2]; + union { + u16 has_exclusive[2]; + u32 exclusive_present; + }; + int refcnt; /* per-core: #HT threads */ unsigned core_id; /* per-core: core id */ }; @@ -176,6 +180,7 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; + int n_excl; /* the number of exclusive events */ unsigned int group_flag; int is_fake; @@ -719,7 +724,7 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int *assign); + int wmin, int wmax, int gpmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); void x86_pmu_stop(struct perf_event *event, int flags); @@ -930,4 +935,8 @@ static inline struct intel_shared_regs *allocate_shared_regs(int cpu) return NULL; } +static inline int is_ht_workaround_enabled(void) +{ + return 0; +} #endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7a58fb5df15c..a1e35c9f06b9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1923,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) xl = &excl_cntrs->states[tid]; xl->sched_started = true; - xl->num_alloc_cntrs = 0; /* * lock shared state until we are done scheduling * in stop_event_scheduling() @@ -2000,6 +1999,11 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * across HT threads */ is_excl = c->flags & PERF_X86_EVENT_EXCL; + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; + if (!cpuc->n_excl++) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); + } /* * xl = state of current HT @@ -2008,18 +2012,6 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, xl = &excl_cntrs->states[tid]; xlo = &excl_cntrs->states[o_tid]; - /* - * do not allow scheduling of more than max_alloc_cntrs - * which is set to half the available generic counters. - * this helps avoid counter starvation of sibling thread - * by ensuring at most half the counters cannot be in - * exclusive mode. There is not designated counters for the - * limits. Any N/2 counters can be used. This helps with - * events with specifix counter constraints - */ - if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs) - return &emptyconstraint; - cx = c; /* @@ -2150,6 +2142,11 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, xl = &excl_cntrs->states[tid]; xlo = &excl_cntrs->states[o_tid]; + if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { + hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; + if (!--cpuc->n_excl) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0); + } /* * put_constraint may be called from x86_schedule_events() @@ -2632,8 +2629,6 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { - int h = x86_pmu.num_counters >> 1; - for_each_cpu(i, topology_thread_cpumask(cpu)) { struct intel_excl_cntrs *c; @@ -2647,11 +2642,6 @@ static void intel_pmu_cpu_starting(int cpu) } cpuc->excl_cntrs->core_id = core_id; cpuc->excl_cntrs->refcnt++; - /* - * set hard limit to half the number of generic counters - */ - cpuc->excl_cntrs->states[0].max_alloc_cntrs = h; - cpuc->excl_cntrs->states[1].max_alloc_cntrs = h; } } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index ec2ba578d286..dd319e59246b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -395,7 +395,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int /* slow path */ if (i != n) ret = perf_assign_events(box->event_constraint, n, - wmin, wmax, assign); + wmin, wmax, n, assign); if (!assign || ret) { for (i = 0; i < n; i++) -- cgit v1.2.1 From f73ec48c90016f89d05726f6c48e66991a790fd7 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 22 May 2015 18:30:22 +0300 Subject: perf/x86/intel/pt: Untangle pt_buffer_reset_markers() Currently, pt_buffer_reset_markers() is a difficult to read knot of arithmetics with a redundant check for multiple-entry TOPA capability, a commented out wakeup marker placement and a logical error wrt to stop marker placement. The latter happens when write head is not page aligned and results in stop marker being placed one page earlier than it actually should. All these problems only affect PT implementations that support multiple-entry TOPA tables (read: proper scatter-gather). For single-entry TOPA implementations, there is no functional impact. This patch deals with all of the above. Tested on both single-entry and multiple-entry TOPA PT implementations. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1432308626-18845-4-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 34 ++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index ffe666c2c6b5..5b804f96ad66 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -615,7 +615,8 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, struct perf_output_handle *handle) { - unsigned long idx, npages, end; + unsigned long head = local64_read(&buf->head); + unsigned long idx, npages, wakeup; if (buf->snapshot) return 0; @@ -634,17 +635,26 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, buf->topa_index[buf->stop_pos]->stop = 0; buf->topa_index[buf->intr_pos]->intr = 0; - if (pt_cap_get(PT_CAP_topa_multiple_entries)) { - npages = (handle->size + 1) >> PAGE_SHIFT; - end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages; - /*if (end > handle->wakeup >> PAGE_SHIFT) - end = handle->wakeup >> PAGE_SHIFT;*/ - idx = end & (buf->nr_pages - 1); - buf->stop_pos = idx; - idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1; - idx &= buf->nr_pages - 1; - buf->intr_pos = idx; - } + /* how many pages till the STOP marker */ + npages = handle->size >> PAGE_SHIFT; + + /* if it's on a page boundary, fill up one more page */ + if (!offset_in_page(head + handle->size + 1)) + npages++; + + idx = (head >> PAGE_SHIFT) + npages; + idx &= buf->nr_pages - 1; + buf->stop_pos = idx; + + wakeup = handle->wakeup >> PAGE_SHIFT; + + /* in the worst case, wake up the consumer one page before hard stop */ + idx = (head >> PAGE_SHIFT) + npages - 1; + if (idx > wakeup) + idx = wakeup; + + idx &= buf->nr_pages - 1; + buf->intr_pos = idx; buf->topa_index[buf->stop_pos]->stop = 1; buf->topa_index[buf->intr_pos]->intr = 1; -- cgit v1.2.1 From 68ab747604da98f0a0414f197f346ac22888fcee Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 18 May 2015 15:16:48 -0400 Subject: perf/x86: Tweak broken BIOS rules during check_hw_exists() I stumbled upon an AMD box that had the BIOS using a hardware performance counter. Instead of printing out a warning and continuing, it failed and blocked further perf counter usage. Looking through the history, I found this commit: a5ebe0ba3dff ("perf/x86: Check all MSRs before passing hw check") which tweaked the rules for a Xen guest on an almost identical box and now changed the behaviour. Unfortunately the rules were tweaked incorrectly and will always lead to MSR failures even though the MSRs are completely fine. What happens now is in arch/x86/kernel/cpu/perf_event.c::check_hw_exists(): for (i = 0; i < x86_pmu.num_counters; i++) { reg = x86_pmu_config_addr(i); ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { bios_fail = 1; val_fail = val; reg_fail = reg; } } /* * Read the current value, change it and read it back to see if it * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ reg = x86_pmu_event_addr(0); ^^^^ if the first perf counter is enabled, then this routine will always fail because the counter is running. :-( if (rdmsrl_safe(reg, &val)) goto msr_fail; val ^= 0xffffUL; ret = wrmsrl_safe(reg, val); ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; The above bios_fail used to be a 'goto' which is why it worked in the past. Further, most vendors have migrated to using fixed counters to hide their evilness hence this problem rarely shows up now days except on a few old boxes. I fixed my problem and kept the spirit of the original Xen fix, by recording a safe non-enable register to be used safely for the reading/writing check. Because it is not enabled, this passes on bare metal boxes (like metal), but should continue to throw an msr_fail on Xen guests because the register isn't emulated yet. Now I get a proper bios_fail error message and Xen should still see their msr_fail message (untested). Signed-off-by: Don Zickus Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: george.dunlap@eu.citrix.com Cc: konrad.wilk@oracle.com Link: http://lkml.kernel.org/r/1431976608-56970-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2eca19422454..4f7001f28936 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -190,6 +190,7 @@ static bool check_hw_exists(void) u64 val, val_fail, val_new= ~0; int i, reg, reg_fail, ret = 0; int bios_fail = 0; + int reg_safe = -1; /* * Check to see if the BIOS enabled any of the counters, if so @@ -204,6 +205,8 @@ static bool check_hw_exists(void) bios_fail = 1; val_fail = val; reg_fail = reg; + } else { + reg_safe = i; } } @@ -221,12 +224,23 @@ static bool check_hw_exists(void) } } + /* + * If all the counters are enabled, the below test will always + * fail. The tools will also become useless in this scenario. + * Just fail and disable the hardware counters. + */ + + if (reg_safe == -1) { + reg = reg_safe; + goto msr_fail; + } + /* * Read the current value, change it and read it back to see if it * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ - reg = x86_pmu_event_addr(0); + reg = x86_pmu_event_addr(reg_safe); if (rdmsrl_safe(reg, &val)) goto msr_fail; val ^= 0xffffUL; -- cgit v1.2.1 From f4d9757ca6f5a2db6919a5b1ab86b8afa16773d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2015 00:00:50 +0000 Subject: perf/x86/intel/cqm: Document PQR MSR abuse The CQM code acts like it owns the PQR MSR completely. That's not true because only the lower 10 bits are used for CQM. The upper 32 bits are used for the 'CLass Of Service ID' (CLOSID). Document the abuse. Will be fixed in a later patch. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Matt Fleming Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Vikas Shivappa Cc: Will Auld Link: http://lkml.kernel.org/r/20150518235149.823214798@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index e4d1b8b738fa..572582e2143e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -978,7 +978,12 @@ static void intel_cqm_event_start(struct perf_event *event, int mode) WARN_ON_ONCE(state->rmid); state->rmid = rmid; - wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid); + /* + * This is actually wrong, as the upper 32 bit MSR contain the + * closid which is used for configuring the Cache Allocation + * Technology component. + */ + wrmsr(MSR_IA32_PQR_ASSOC, rmid, 0); raw_spin_unlock_irqrestore(&state->lock, flags); } @@ -998,7 +1003,13 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode) if (!--state->cnt) { state->rmid = 0; - wrmsrl(MSR_IA32_PQR_ASSOC, 0); + /* + * This is actually wrong, as the upper 32 bit of the + * MSR contain the closid which is used for + * configuring the Cache Allocation Technology + * component. + */ + wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); } else { WARN_ON_ONCE(!state->rmid); } -- cgit v1.2.1 From b3df4ec4424f27e55d754cfe586195fecca1c4e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2015 00:00:51 +0000 Subject: perf/x86/intel/cqm: Use proper data types 'int' is really not a proper data type for an MSR. Use u32 to make it clear that we are dealing with a 32-bit unsigned hardware value. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Matt Fleming Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Vikas Shivappa Cc: Will Auld Link: http://lkml.kernel.org/r/20150518235149.919350144@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 572582e2143e..3e9a7fbfce58 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -18,7 +18,7 @@ static unsigned int cqm_l3_scale; /* supposedly cacheline size */ struct intel_cqm_state { raw_spinlock_t lock; - int rmid; + u32 rmid; int cnt; }; @@ -962,7 +962,7 @@ out: static void intel_cqm_event_start(struct perf_event *event, int mode) { struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); - unsigned int rmid = event->hw.cqm_rmid; + u32 rmid = event->hw.cqm_rmid; unsigned long flags; if (!(event->hw.cqm_state & PERF_HES_STOPPED)) -- cgit v1.2.1 From 9e7eaac95af6c1aecaf558b8c7a1757d5f2d2ad7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2015 00:00:53 +0000 Subject: perf/x86/intel/cqm: Remove pointless spinlock from state cache 'struct intel_cqm_state' is a strict per CPU cache of the rmid and the usage counter. It can never be modified from a remote CPU. The three functions which modify the content: intel_cqm_event[start|stop|del] (del maps to stop) are called from the perf core with interrupts disabled which is enough protection for the per CPU state values. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Matt Fleming Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Vikas Shivappa Cc: Will Auld Link: http://lkml.kernel.org/r/20150518235150.001006529@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 3e9a7fbfce58..63391f860175 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -17,11 +17,16 @@ static unsigned int cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ struct intel_cqm_state { - raw_spinlock_t lock; u32 rmid; int cnt; }; +/* + * The cached intel_cqm_state is strictly per CPU and can never be + * updated from a remote CPU. Both functions which modify the state + * (intel_cqm_event_start and intel_cqm_event_stop) are called with + * interrupts disabled, which is sufficient for the protection. + */ static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state); /* @@ -963,15 +968,12 @@ static void intel_cqm_event_start(struct perf_event *event, int mode) { struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); u32 rmid = event->hw.cqm_rmid; - unsigned long flags; if (!(event->hw.cqm_state & PERF_HES_STOPPED)) return; event->hw.cqm_state &= ~PERF_HES_STOPPED; - raw_spin_lock_irqsave(&state->lock, flags); - if (state->cnt++) WARN_ON_ONCE(state->rmid != rmid); else @@ -984,21 +986,17 @@ static void intel_cqm_event_start(struct perf_event *event, int mode) * Technology component. */ wrmsr(MSR_IA32_PQR_ASSOC, rmid, 0); - - raw_spin_unlock_irqrestore(&state->lock, flags); } static void intel_cqm_event_stop(struct perf_event *event, int mode) { struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); - unsigned long flags; if (event->hw.cqm_state & PERF_HES_STOPPED) return; event->hw.cqm_state |= PERF_HES_STOPPED; - raw_spin_lock_irqsave(&state->lock, flags); intel_cqm_event_read(event); if (!--state->cnt) { @@ -1013,8 +1011,6 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode) } else { WARN_ON_ONCE(!state->rmid); } - - raw_spin_unlock_irqrestore(&state->lock, flags); } static int intel_cqm_event_add(struct perf_event *event, int mode) @@ -1257,7 +1253,6 @@ static void intel_cqm_cpu_prepare(unsigned int cpu) struct intel_cqm_state *state = &per_cpu(cqm_state, cpu); struct cpuinfo_x86 *c = &cpu_data(cpu); - raw_spin_lock_init(&state->lock); state->rmid = 0; state->cnt = 0; -- cgit v1.2.1 From 0bac237845e203dd1439cfc571b1baf1b2274b3b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2015 00:00:55 +0000 Subject: perf/x86/intel/cqm: Avoid pointless MSR write If the usage counter is non-zero there is no point to update the rmid in the PQR MSR. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Matt Fleming Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Vikas Shivappa Cc: Will Auld Link: http://lkml.kernel.org/r/20150518235150.080844281@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 63391f860175..2ce69c0953ab 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -974,10 +974,12 @@ static void intel_cqm_event_start(struct perf_event *event, int mode) event->hw.cqm_state &= ~PERF_HES_STOPPED; - if (state->cnt++) - WARN_ON_ONCE(state->rmid != rmid); - else + if (state->cnt++) { + if (!WARN_ON_ONCE(state->rmid != rmid)) + return; + } else { WARN_ON_ONCE(state->rmid); + } state->rmid = rmid; /* -- cgit v1.2.1 From 43d0c2f6dcd07ffc0de658a7fbeeb63c806e9caa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2015 00:00:56 +0000 Subject: perf/x86/intel/cqm: Remove useless wrapper function intel_cqm_event_del() is a 1:1 wrapper for intel_cqm_event_stop(). Remove the useless indirection. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Matt Fleming Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Vikas Shivappa Cc: Will Auld Link: http://lkml.kernel.org/r/20150518235150.159779847@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 2ce69c0953ab..8241b64d34c4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -1033,11 +1033,6 @@ static int intel_cqm_event_add(struct perf_event *event, int mode) return 0; } -static void intel_cqm_event_del(struct perf_event *event, int mode) -{ - intel_cqm_event_stop(event, mode); -} - static void intel_cqm_event_destroy(struct perf_event *event) { struct perf_event *group_other = NULL; @@ -1230,7 +1225,7 @@ static struct pmu intel_cqm_pmu = { .task_ctx_nr = perf_sw_context, .event_init = intel_cqm_event_init, .add = intel_cqm_event_add, - .del = intel_cqm_event_del, + .del = intel_cqm_event_stop, .start = intel_cqm_event_start, .stop = intel_cqm_event_stop, .read = intel_cqm_event_read, -- cgit v1.2.1 From bf926731e1585ccad029ca2fad1444fee082b78d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2015 00:00:58 +0000 Subject: perf/x86/intel/cqm: Add storage for 'closid' and clean up 'struct intel_pqr_state' 'closid' (CLass Of Service ID) is used for the Class based Cache Allocation Technology (CAT). Add explicit storage to the per cpu cache for it, so it can be used later with the CAT support (requires to move the per cpu data). While at it: - Rename the structure to intel_pqr_state which reflects the actual purpose of the struct: cache values which go into the PQR MSR - Rename 'cnt' to rmid_usecnt which reflects the actual purpose of the counter. - Document the structure and the struct members. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Matt Fleming Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Vikas Shivappa Cc: Will Auld Link: http://lkml.kernel.org/r/20150518235150.240899319@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 50 ++++++++++++++++-------------- 1 file changed, 27 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 8241b64d34c4..8233b29bdd35 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -16,18 +16,32 @@ static unsigned int cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ -struct intel_cqm_state { +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @rmid: The cached Resource Monitoring ID + * @closid: The cached Class Of Service ID + * @rmid_usecnt: The usage counter for rmid + * + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { u32 rmid; - int cnt; + u32 closid; + int rmid_usecnt; }; /* - * The cached intel_cqm_state is strictly per CPU and can never be + * The cached intel_pqr_state is strictly per CPU and can never be * updated from a remote CPU. Both functions which modify the state * (intel_cqm_event_start and intel_cqm_event_stop) are called with * interrupts disabled, which is sufficient for the protection. */ -static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state); +static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); /* * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. @@ -966,7 +980,7 @@ out: static void intel_cqm_event_start(struct perf_event *event, int mode) { - struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); u32 rmid = event->hw.cqm_rmid; if (!(event->hw.cqm_state & PERF_HES_STOPPED)) @@ -974,7 +988,7 @@ static void intel_cqm_event_start(struct perf_event *event, int mode) event->hw.cqm_state &= ~PERF_HES_STOPPED; - if (state->cnt++) { + if (state->rmid_usecnt++) { if (!WARN_ON_ONCE(state->rmid != rmid)) return; } else { @@ -982,17 +996,12 @@ static void intel_cqm_event_start(struct perf_event *event, int mode) } state->rmid = rmid; - /* - * This is actually wrong, as the upper 32 bit MSR contain the - * closid which is used for configuring the Cache Allocation - * Technology component. - */ - wrmsr(MSR_IA32_PQR_ASSOC, rmid, 0); + wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); } static void intel_cqm_event_stop(struct perf_event *event, int mode) { - struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); if (event->hw.cqm_state & PERF_HES_STOPPED) return; @@ -1001,15 +1010,9 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode) intel_cqm_event_read(event); - if (!--state->cnt) { + if (!--state->rmid_usecnt) { state->rmid = 0; - /* - * This is actually wrong, as the upper 32 bit of the - * MSR contain the closid which is used for - * configuring the Cache Allocation Technology - * component. - */ - wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); + wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); } else { WARN_ON_ONCE(!state->rmid); } @@ -1247,11 +1250,12 @@ static inline void cqm_pick_event_reader(int cpu) static void intel_cqm_cpu_prepare(unsigned int cpu) { - struct intel_cqm_state *state = &per_cpu(cqm_state, cpu); + struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); struct cpuinfo_x86 *c = &cpu_data(cpu); state->rmid = 0; - state->cnt = 0; + state->closid = 0; + state->rmid_usecnt = 0; WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); -- cgit v1.2.1 From adafa99960ef18b019f001ddee4d9d81c4e25944 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 22 May 2015 09:59:42 +0100 Subject: perf/x86/intel/cqm: Use 'u32' data type for RMIDs Since we write RMID values to MSRs the correct type to use is 'u32' because that clearly articulates we're writing a hardware register value. Fix up all uses of RMID in this code to consistently use the correct data type. Reported-by: Thomas Gleixner Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Vikas Shivappa Cc: Will Auld Link: http://lkml.kernel.org/r/1432285182-17180-1-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 37 +++++++++++++++--------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 8233b29bdd35..188076161c1b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -13,7 +13,7 @@ #define MSR_IA32_QM_CTR 0x0c8e #define MSR_IA32_QM_EVTSEL 0x0c8d -static unsigned int cqm_max_rmid = -1; +static u32 cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ /** @@ -76,7 +76,7 @@ static cpumask_t cqm_cpumask; * near-zero occupancy value, i.e. no cachelines are tagged with this * RMID, once __intel_cqm_rmid_rotate() returns. */ -static unsigned int intel_cqm_rotation_rmid; +static u32 intel_cqm_rotation_rmid; #define INVALID_RMID (-1) @@ -88,7 +88,7 @@ static unsigned int intel_cqm_rotation_rmid; * Likewise, an rmid value of -1 is used to indicate "no rmid currently * assigned" and is used as part of the rotation code. */ -static inline bool __rmid_valid(unsigned int rmid) +static inline bool __rmid_valid(u32 rmid) { if (!rmid || rmid == INVALID_RMID) return false; @@ -96,7 +96,7 @@ static inline bool __rmid_valid(unsigned int rmid) return true; } -static u64 __rmid_read(unsigned int rmid) +static u64 __rmid_read(u32 rmid) { u64 val; @@ -121,7 +121,7 @@ enum rmid_recycle_state { }; struct cqm_rmid_entry { - unsigned int rmid; + u32 rmid; enum rmid_recycle_state state; struct list_head list; unsigned long queue_time; @@ -166,7 +166,7 @@ static LIST_HEAD(cqm_rmid_limbo_lru); */ static struct cqm_rmid_entry **cqm_rmid_ptrs; -static inline struct cqm_rmid_entry *__rmid_entry(int rmid) +static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) { struct cqm_rmid_entry *entry; @@ -181,7 +181,7 @@ static inline struct cqm_rmid_entry *__rmid_entry(int rmid) * * We expect to be called with cache_mutex held. */ -static int __get_rmid(void) +static u32 __get_rmid(void) { struct cqm_rmid_entry *entry; @@ -196,7 +196,7 @@ static int __get_rmid(void) return entry->rmid; } -static void __put_rmid(unsigned int rmid) +static void __put_rmid(u32 rmid) { struct cqm_rmid_entry *entry; @@ -391,7 +391,7 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b) } struct rmid_read { - unsigned int rmid; + u32 rmid; atomic64_t value; }; @@ -400,12 +400,11 @@ static void __intel_cqm_event_count(void *info); /* * Exchange the RMID of a group of events. */ -static unsigned int -intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid) +static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) { struct perf_event *event; - unsigned int old_rmid = group->hw.cqm_rmid; struct list_head *head = &group->hw.cqm_group_entry; + u32 old_rmid = group->hw.cqm_rmid; lockdep_assert_held(&cache_mutex); @@ -470,7 +469,7 @@ static void intel_cqm_stable(void *arg) * If we have group events waiting for an RMID that don't conflict with * events already running, assign @rmid. */ -static bool intel_cqm_sched_in_event(unsigned int rmid) +static bool intel_cqm_sched_in_event(u32 rmid) { struct perf_event *leader, *event; @@ -617,7 +616,7 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available) static void __intel_cqm_pick_and_rotate(struct perf_event *next) { struct perf_event *rotor; - unsigned int rmid; + u32 rmid; lockdep_assert_held(&cache_mutex); @@ -645,7 +644,7 @@ static void __intel_cqm_pick_and_rotate(struct perf_event *next) static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) { struct perf_event *group, *g; - unsigned int rmid; + u32 rmid; lockdep_assert_held(&cache_mutex); @@ -847,8 +846,8 @@ static void intel_cqm_setup_event(struct perf_event *event, struct perf_event **group) { struct perf_event *iter; - unsigned int rmid; bool conflict = false; + u32 rmid; list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { rmid = iter->hw.cqm_rmid; @@ -879,7 +878,7 @@ static void intel_cqm_setup_event(struct perf_event *event, static void intel_cqm_event_read(struct perf_event *event) { unsigned long flags; - unsigned int rmid; + u32 rmid; u64 val; /* @@ -1021,7 +1020,7 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode) static int intel_cqm_event_add(struct perf_event *event, int mode) { unsigned long flags; - unsigned int rmid; + u32 rmid; raw_spin_lock_irqsave(&cache_lock, flags); @@ -1064,7 +1063,7 @@ static void intel_cqm_event_destroy(struct perf_event *event) list_replace(&event->hw.cqm_groups_entry, &group_other->hw.cqm_groups_entry); } else { - unsigned int rmid = event->hw.cqm_rmid; + u32 rmid = event->hw.cqm_rmid; if (__rmid_valid(rmid)) __put_rmid(rmid); -- cgit v1.2.1 From 1c565833ac7950a5ddb3322e9848e0628ceba3b5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 10:57:21 +0200 Subject: perf/x86/intel: Correct local vs remote sibling state For some obscure reason the current code accounts the current SMT thread's state on the remote thread and reads the remote's state on the local SMT thread. While internally consistent, and 'correct' its pointless confusion we can do without. Flip them the right way around. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 79 ++++++++++++++-------------------- 1 file changed, 33 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a1e35c9f06b9..0ea040562bb8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1903,9 +1903,8 @@ static void intel_start_scheduling(struct cpu_hw_events *cpuc) { struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl, *xlo; + struct intel_excl_states *xl; int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; /* sibling thread */ /* * nothing needed if in group validation mode @@ -1919,7 +1918,6 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) if (!excl_cntrs) return; - xlo = &excl_cntrs->states[o_tid]; xl = &excl_cntrs->states[tid]; xl->sched_started = true; @@ -1932,18 +1930,17 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) raw_spin_lock(&excl_cntrs->lock); /* - * save initial state of sibling thread + * Save a copy of our state to work on. */ - memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state)); + memcpy(xl->init_state, xl->state, sizeof(xl->init_state)); } static void intel_stop_scheduling(struct cpu_hw_events *cpuc) { struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl, *xlo; + struct intel_excl_states *xl; int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; /* sibling thread */ /* * nothing needed if in group validation mode @@ -1956,13 +1953,12 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc) if (!excl_cntrs) return; - xlo = &excl_cntrs->states[o_tid]; xl = &excl_cntrs->states[tid]; /* - * make new sibling thread state visible + * Commit the working state. */ - memcpy(xlo->state, xlo->init_state, sizeof(xlo->state)); + memcpy(xl->state, xl->init_state, sizeof(xl->state)); xl->sched_started = false; /* @@ -1977,10 +1973,9 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, { struct event_constraint *cx; struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl, *xlo; - int is_excl, i; + struct intel_excl_states *xlo; int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; /* alternate */ + int is_excl, i; /* * validating a group does not require @@ -1994,23 +1989,6 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, */ if (!excl_cntrs) return c; - /* - * event requires exclusive counter access - * across HT threads - */ - is_excl = c->flags & PERF_X86_EVENT_EXCL; - if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { - event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; - if (!cpuc->n_excl++) - WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); - } - - /* - * xl = state of current HT - * xlo = state of sibling HT - */ - xl = &excl_cntrs->states[tid]; - xlo = &excl_cntrs->states[o_tid]; cx = c; @@ -2053,6 +2031,22 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * of this function */ + /* + * state of sibling HT + */ + xlo = &excl_cntrs->states[tid ^ 1]; + + /* + * event requires exclusive counter access + * across HT threads + */ + is_excl = c->flags & PERF_X86_EVENT_EXCL; + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; + if (!cpuc->n_excl++) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); + } + /* * Modify static constraint with current dynamic * state of thread @@ -2067,14 +2061,14 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * our corresponding counter cannot be used * regardless of our event */ - if (xl->state[i] == INTEL_EXCL_EXCLUSIVE) + if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) __clear_bit(i, cx->idxmsk); /* * if measuring an exclusive event, sibling * measuring non-exclusive, then counter cannot * be used */ - if (is_excl && xl->state[i] == INTEL_EXCL_SHARED) + if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) __clear_bit(i, cx->idxmsk); } @@ -2124,10 +2118,9 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, { struct hw_perf_event *hwc = &event->hw; struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xlo, *xl; - unsigned long flags = 0; /* keep compiler happy */ int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; + struct intel_excl_states *xl; + unsigned long flags = 0; /* keep compiler happy */ /* * nothing needed if in group validation mode @@ -2141,7 +2134,6 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, return; xl = &excl_cntrs->states[tid]; - xlo = &excl_cntrs->states[o_tid]; if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; if (!--cpuc->n_excl) @@ -2161,7 +2153,7 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, * counter state as unused now */ if (hwc->idx >= 0) - xlo->state[hwc->idx] = INTEL_EXCL_UNUSED; + xl->state[hwc->idx] = INTEL_EXCL_UNUSED; if (!xl->sched_started) raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags); @@ -2200,16 +2192,12 @@ static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cnt { struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; struct event_constraint *c = cpuc->event_constraint[idx]; - struct intel_excl_states *xlo, *xl; + struct intel_excl_states *xl; int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; - int is_excl; if (cpuc->is_fake || !c) return; - is_excl = c->flags & PERF_X86_EVENT_EXCL; - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) return; @@ -2219,15 +2207,14 @@ static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cnt return; xl = &excl_cntrs->states[tid]; - xlo = &excl_cntrs->states[o_tid]; WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock)); if (cntr >= 0) { - if (is_excl) - xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; + if (c->flags & PERF_X86_EVENT_EXCL) + xl->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; else - xlo->init_state[cntr] = INTEL_EXCL_SHARED; + xl->init_state[cntr] = INTEL_EXCL_SHARED; } } -- cgit v1.2.1 From b32ed7f5de262b10633bb6c6dbb7f8ba46598cf4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 12:38:21 +0200 Subject: perf/x86/intel: Add lockdep assert Lockdep is very good at finding incorrect IRQ state while locking and is far better at telling us if we hold a lock than the _is_locked() API. It also generates less code for !DEBUG kernels. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 0ea040562bb8..5182cee16367 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1926,7 +1926,6 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) * in stop_event_scheduling() * makes scheduling appear as a transaction */ - WARN_ON_ONCE(!irqs_disabled()); raw_spin_lock(&excl_cntrs->lock); /* @@ -2208,7 +2207,7 @@ static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cnt xl = &excl_cntrs->states[tid]; - WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock)); + lockdep_assert_held(&excl_cntrs->lock); if (cntr >= 0) { if (c->flags & PERF_X86_EVENT_EXCL) -- cgit v1.2.1 From aaf932e8161e45291cc85085b6d850f1bbdf53c8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 10:57:24 +0200 Subject: perf/x86/intel: Simplify the dynamic constraint code somewhat We have two 'struct event_constraint' local variables in intel_get_excl_constraints(): 'cx' and 'c'. Instead of using 'cx' after the dynamic allocation, put all 'cx' inside the dynamic allocation block and use 'c' outside of it. Also use direct assignment to copy the structure; let the compiler figure it out. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 5182cee16367..9588b8d1264d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1970,7 +1970,6 @@ static struct event_constraint * intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, int idx, struct event_constraint *c) { - struct event_constraint *cx; struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; struct intel_excl_states *xlo; int tid = cpuc->excl_thread_id; @@ -1989,8 +1988,6 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, if (!excl_cntrs) return c; - cx = c; - /* * because we modify the constraint, we need * to make a copy. Static constraints come @@ -2000,6 +1997,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * been cloned (marked dynamic) */ if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { + struct event_constraint *cx; /* sanity check */ if (idx < 0) @@ -2014,13 +2012,14 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * initialize dynamic constraint * with static constraint */ - memcpy(cx, c, sizeof(*cx)); + *cx = *c; /* * mark constraint as dynamic, so we * can free it later on */ cx->flags |= PERF_X86_EVENT_DYNAMIC; + c = cx; } /* @@ -2054,37 +2053,37 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * SHARED : sibling counter measuring non-exclusive event * UNUSED : sibling counter unused */ - for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { /* * exclusive event in sibling counter * our corresponding counter cannot be used * regardless of our event */ if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) - __clear_bit(i, cx->idxmsk); + __clear_bit(i, c->idxmsk); /* * if measuring an exclusive event, sibling * measuring non-exclusive, then counter cannot * be used */ if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) - __clear_bit(i, cx->idxmsk); + __clear_bit(i, c->idxmsk); } /* * recompute actual bit weight for scheduling algorithm */ - cx->weight = hweight64(cx->idxmsk64); + c->weight = hweight64(c->idxmsk64); /* * if we return an empty mask, then switch * back to static empty constraint to avoid * the cost of freeing later on */ - if (cx->weight == 0) - cx = &emptyconstraint; + if (c->weight == 0) + c = &emptyconstraint; - return cx; + return c; } static struct event_constraint * -- cgit v1.2.1 From 17186ccda374ae02ef231cbbc8f1825e7c19ddbd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 10:57:28 +0200 Subject: perf/x86/intel: Make WARN()ings consistent The intel_commit_scheduling() callback is pointlessly different from the start and stop scheduling callback. Furthermore, the constraint should never be NULL, so remove that test. Even though we'll never get called (because we NULL the callbacks) when !is_ht_workaround_enabled() put that test in. Collapse the (pointless) WARN_ON_ONCE() and bail on !cpuc->excl_cntrs -- this is doubly pointless, because its the same condition as is_ht_workaround_enabled() which was already pointless because the whole method won't ever be called. Furthremore, make all the !excl_cntrs test WARN_ON_ONCE(); they're all pointless, because the above, either the function ({get,put}_excl_constraint) are already predicated on it existing or the is_ht_workaround_enabled() thing is the same test. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9588b8d1264d..d7d30b41f6a3 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1915,7 +1915,7 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) /* * no exclusion needed */ - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return; xl = &excl_cntrs->states[tid]; @@ -1949,7 +1949,7 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc) /* * no exclusion needed */ - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return; xl = &excl_cntrs->states[tid]; @@ -1985,7 +1985,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, /* * no exclusion needed */ - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return c; /* @@ -2126,9 +2126,7 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, if (cpuc->is_fake) return; - WARN_ON_ONCE(!excl_cntrs); - - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return; xl = &excl_cntrs->states[tid]; @@ -2193,15 +2191,13 @@ static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cnt struct intel_excl_states *xl; int tid = cpuc->excl_thread_id; - if (cpuc->is_fake || !c) + if (cpuc->is_fake || !is_ht_workaround_enabled()) return; - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) + if (WARN_ON_ONCE(!excl_cntrs)) return; - WARN_ON_ONCE(!excl_cntrs); - - if (!excl_cntrs) + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) return; xl = &excl_cntrs->states[tid]; -- cgit v1.2.1 From 0c41e756b9c5a9899b5cd238226600f8f34c9b82 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 10:57:32 +0200 Subject: perf/x86/intel: Clean up intel_commit_scheduling() placement Move the code of intel_commit_scheduling() to the right place, which is in between start() and stop(). No change in functionality. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 4 +-- arch/x86/kernel/cpu/perf_event_intel.c | 60 +++++++++++++++++----------------- 2 files changed, 32 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ef78516850fb..e5609522255c 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -527,10 +527,10 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); - void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); - void (*start_scheduling)(struct cpu_hw_events *cpuc); + void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); + void (*stop_scheduling)(struct cpu_hw_events *cpuc); struct event_constraint *event_constraints; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index d7d30b41f6a3..ff56fc3f016e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1934,6 +1934,34 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) memcpy(xl->init_state, xl->state, sizeof(xl->init_state)); } +static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct event_constraint *c = cpuc->event_constraint[idx]; + struct intel_excl_states *xl; + int tid = cpuc->excl_thread_id; + + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return; + + if (WARN_ON_ONCE(!excl_cntrs)) + return; + + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) + return; + + xl = &excl_cntrs->states[tid]; + + lockdep_assert_held(&excl_cntrs->lock); + + if (cntr >= 0) { + if (c->flags & PERF_X86_EVENT_EXCL) + xl->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; + else + xl->init_state[cntr] = INTEL_EXCL_SHARED; + } +} + static void intel_stop_scheduling(struct cpu_hw_events *cpuc) { @@ -2184,34 +2212,6 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, intel_put_excl_constraints(cpuc, event); } -static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct event_constraint *c = cpuc->event_constraint[idx]; - struct intel_excl_states *xl; - int tid = cpuc->excl_thread_id; - - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return; - - if (WARN_ON_ONCE(!excl_cntrs)) - return; - - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) - return; - - xl = &excl_cntrs->states[tid]; - - lockdep_assert_held(&excl_cntrs->lock); - - if (cntr >= 0) { - if (c->flags & PERF_X86_EVENT_EXCL) - xl->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; - else - xl->init_state[cntr] = INTEL_EXCL_SHARED; - } -} - static void intel_pebs_aliases_core2(struct perf_event *event) { if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { @@ -2920,8 +2920,8 @@ static __init void intel_ht_bug(void) { x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED; - x86_pmu.commit_scheduling = intel_commit_scheduling; x86_pmu.start_scheduling = intel_start_scheduling; + x86_pmu.commit_scheduling = intel_commit_scheduling; x86_pmu.stop_scheduling = intel_stop_scheduling; } @@ -3377,8 +3377,8 @@ static __init int fixup_ht_bug(void) x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); - x86_pmu.commit_scheduling = NULL; x86_pmu.start_scheduling = NULL; + x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; watchdog_nmi_enable_all(); -- cgit v1.2.1 From 1fe684e349e904adeed2883cfdeef259a21c94f4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 10:57:36 +0200 Subject: perf/x86/intel: Remove pointless tests Both intel_commit_scheduling() and intel_get_excl_contraints() test for cntr < 0. The only way that can happen (aside from a bug) is through validate_event(), however that is already captured by the cpuc->is_fake test. So remove these test and simplify the code. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ff56fc3f016e..6a3e794cdc06 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1954,12 +1954,10 @@ static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cnt lockdep_assert_held(&excl_cntrs->lock); - if (cntr >= 0) { - if (c->flags & PERF_X86_EVENT_EXCL) - xl->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; - else - xl->init_state[cntr] = INTEL_EXCL_SHARED; - } + if (c->flags & PERF_X86_EVENT_EXCL) + xl->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; + else + xl->init_state[cntr] = INTEL_EXCL_SHARED; } static void @@ -2027,10 +2025,6 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { struct event_constraint *cx; - /* sanity check */ - if (idx < 0) - return &emptyconstraint; - /* * grab pre-allocated constraint entry */ -- cgit v1.2.1 From 43ef205bded025432f5eeeb3503c11fe5cd1913e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 10:57:39 +0200 Subject: perf/x86/intel: Remove intel_excl_states::init_state For some obscure reason intel_{start,stop}_scheduling() copy the HT state to an intermediate array. This would make sense if we ever were to make changes to it which we'd have to discard. Except we don't. By the time we call intel_commit_scheduling() we're; as the name implies; committed to them. We'll never back out. A further hint its pointless is that stop_scheduling() unconditionally publishes the state. So the intermediate array is pointless, modify the state in place and kill the extra array. And remove the pointless array initialization: INTEL_EXCL_UNUSED == 0. Note; all is serialized by intel_excl_cntr::lock. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 - arch/x86/kernel/cpu/perf_event.h | 1 - arch/x86/kernel/cpu/perf_event_intel.c | 22 ++-------------------- 3 files changed, 2 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4f7001f28936..d275da3d81dd 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -884,7 +884,6 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) } if (!assign || unsched) { - for (i = 0; i < n; i++) { e = cpuc->event_list[i]; /* diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index e5609522255c..89e6cd61e6ae 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -133,7 +133,6 @@ enum intel_excl_state_type { }; struct intel_excl_states { - enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; enum intel_excl_state_type state[X86_PMC_IDX_MAX]; bool sched_started; /* true if scheduling has started */ }; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6a3e794cdc06..f3201439031d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1927,11 +1927,6 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) * makes scheduling appear as a transaction */ raw_spin_lock(&excl_cntrs->lock); - - /* - * Save a copy of our state to work on. - */ - memcpy(xl->init_state, xl->state, sizeof(xl->init_state)); } static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) @@ -1955,9 +1950,9 @@ static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cnt lockdep_assert_held(&excl_cntrs->lock); if (c->flags & PERF_X86_EVENT_EXCL) - xl->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; + xl->state[cntr] = INTEL_EXCL_EXCLUSIVE; else - xl->init_state[cntr] = INTEL_EXCL_SHARED; + xl->state[cntr] = INTEL_EXCL_SHARED; } static void @@ -1980,11 +1975,6 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc) xl = &excl_cntrs->states[tid]; - /* - * Commit the working state. - */ - memcpy(xl->state, xl->init_state, sizeof(xl->state)); - xl->sched_started = false; /* * release shared state lock (acquired in intel_start_scheduling()) @@ -2519,19 +2509,11 @@ struct intel_shared_regs *allocate_shared_regs(int cpu) static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) { struct intel_excl_cntrs *c; - int i; c = kzalloc_node(sizeof(struct intel_excl_cntrs), GFP_KERNEL, cpu_to_node(cpu)); if (c) { raw_spin_lock_init(&c->lock); - for (i = 0; i < X86_PMC_IDX_MAX; i++) { - c->states[0].state[i] = INTEL_EXCL_UNUSED; - c->states[0].init_state[i] = INTEL_EXCL_UNUSED; - - c->states[1].state[i] = INTEL_EXCL_UNUSED; - c->states[1].init_state[i] = INTEL_EXCL_UNUSED; - } c->core_id = -1; } return c; -- cgit v1.2.1 From 8736e548db0f48244132bc36331496494625bbaf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 10:57:43 +0200 Subject: perf/x86: Simplify the x86_schedule_events() logic !x && y == ! (x || !y) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d275da3d81dd..dbe3328f8ad7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -881,9 +881,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (x86_pmu.commit_scheduling) x86_pmu.commit_scheduling(cpuc, i, assign[i]); } - } - - if (!assign || unsched) { + } else { for (i = 0; i < n; i++) { e = cpuc->event_list[i]; /* -- cgit v1.2.1 From ba040653b48d32afa8b6c3331eae97c6bbb66f03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 May 2015 11:36:13 +0200 Subject: perf/x86/intel: Simplify put_exclusive_constraints() Don't bother with taking locks if we're not actually going to do anything. Also, drop the _irqsave(), this is very much only called from IRQ-disabled context. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index f3201439031d..74f19d9268bb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2130,7 +2130,6 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; int tid = cpuc->excl_thread_id; struct intel_excl_states *xl; - unsigned long flags = 0; /* keep compiler happy */ /* * nothing needed if in group validation mode @@ -2141,7 +2140,6 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, if (WARN_ON_ONCE(!excl_cntrs)) return; - xl = &excl_cntrs->states[tid]; if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; if (!--cpuc->n_excl) @@ -2149,22 +2147,25 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, } /* - * put_constraint may be called from x86_schedule_events() - * which already has the lock held so here make locking - * conditional + * If event was actually assigned, then mark the counter state as + * unused now. */ - if (!xl->sched_started) - raw_spin_lock_irqsave(&excl_cntrs->lock, flags); + if (hwc->idx >= 0) { + xl = &excl_cntrs->states[tid]; + + /* + * put_constraint may be called from x86_schedule_events() + * which already has the lock held so here make locking + * conditional. + */ + if (!xl->sched_started) + raw_spin_lock(&excl_cntrs->lock); - /* - * if event was actually assigned, then mark the - * counter state as unused now - */ - if (hwc->idx >= 0) xl->state[hwc->idx] = INTEL_EXCL_UNUSED; - if (!xl->sched_started) - raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags); + if (!xl->sched_started) + raw_spin_unlock(&excl_cntrs->lock); + } } static void -- cgit v1.2.1 From 74387bcb711b7b2ed65c0ed08953e13d4e31969e Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 21 Apr 2015 16:16:13 +0300 Subject: perf/x86/intel/pt: Kill an unused variable Currently, there's a set-but-not-used variable in setup_topa_index(); this patch gets rid of it. And while at it, fixes a style issue with brackets around a one-line block. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1429622177-22843-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 5b804f96ad66..8a4595650d6e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -674,7 +674,7 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf) struct topa *cur = buf->first, *prev = buf->last; struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), *te_prev = TOPA_ENTRY(prev, prev->last - 1); - int pg = 0, idx = 0, ntopa = 0; + int pg = 0, idx = 0; while (pg < buf->nr_pages) { int tidx; @@ -689,9 +689,9 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf) /* advance to next topa table */ idx = 0; cur = list_entry(cur->list.next, struct topa, list); - ntopa++; - } else + } else { idx++; + } te_cur = TOPA_ENTRY(cur, idx); } -- cgit v1.2.1 From cf302bfdf3039853fce812ae1ffd0ac24f5b468f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 21 Apr 2015 16:16:15 +0300 Subject: perf/x86/intel/pt: Document pt_buffer_reset_markers() The comments in the driver don't make it absolutely clear as to what exactly is the calling order and other possible constraints of buffer management functions. Document constraints and calling order for the buffer configuration functions. While at it, replace a redundant check in pt_buffer_reset_markers() with an explanation why it is not needed. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1429622177-22843-4-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 8a4595650d6e..b2746eafa0cd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -609,7 +609,12 @@ static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) * @handle: Current output handle. * * Place INT and STOP marks to prevent overwriting old data that the consumer - * hasn't yet collected. + * hasn't yet collected and waking up the consumer after a certain fraction of + * the buffer has filled up. Only needed and sensible for non-snapshot counters. + * + * This obviously relies on buf::head to figure out buffer markers, so it has + * to be called after pt_buffer_reset_offsets() and before the hardware tracing + * is enabled. */ static int pt_buffer_reset_markers(struct pt_buffer *buf, struct perf_output_handle *handle) @@ -618,9 +623,6 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, unsigned long head = local64_read(&buf->head); unsigned long idx, npages, wakeup; - if (buf->snapshot) - return 0; - /* can't stop in the middle of an output region */ if (buf->output_off + handle->size + 1 < sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) @@ -901,6 +903,7 @@ void intel_pt_interrupt(void) } pt_buffer_reset_offsets(buf, pt->handle.head); + /* snapshot counters don't use PMI, so it's safe */ ret = pt_buffer_reset_markers(buf, &pt->handle); if (ret) { perf_aux_output_end(&pt->handle, 0, true); -- cgit v1.2.1 From 5b1dbd17c0dee679b154ce47f534677b7e0f7ad6 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 21 Apr 2015 16:16:16 +0300 Subject: perf/x86/intel/pt: Document pt_buffer_reset_offsets() Currently, the description of pt_buffer_reset_offsets() lacks information about its calling constraints and ordering with regards to other buffer management functions. Add a clarification about when this function has to be called. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1429622177-22843-5-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index b2746eafa0cd..40ba5e4312d4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -705,7 +705,14 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf) * @head: Write pointer (aux_head) from AUX buffer. * * Find the ToPA table and entry corresponding to given @head and set buffer's - * "current" pointers accordingly. + * "current" pointers accordingly. This is done after we have obtained the + * current aux_head position from a successful call to perf_aux_output_begin() + * to make sure the hardware is writing to the right place. + * + * This function modifies buf::{cur,cur_idx,output_off} that will be programmed + * into PT msrs when the tracing is enabled and buf::head and buf::data_size, + * which are used to determine INT and STOP markers' locations by a subsequent + * call to pt_buffer_reset_markers(). */ static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) { -- cgit v1.2.1 From 0a487aad2dfd088bcbbe1766944280b40ff969a5 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 21 Apr 2015 16:16:17 +0300 Subject: perf/x86/intel/pt: Kill pt_is_running() Initially, we were trying to guard against scenarios where somebody attaches to the system with a hardware debugger while PT is enabled from software and pt_is_running() tries to make sure we handle this better, but the truth is, there is still a race window no matter what and people with hardware debuggers should really know what they are doing anyway. In other words, there is no point in keeping this one around, and it's one RDMSR instructions fewer in the fast path. The case when PT is enabled by the BIOS at boot time is handled in the driver initialization path and doesn't use pt_is_running(). This patch gets rid of it. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1429622177-22843-6-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 40ba5e4312d4..a2d407172d61 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -187,15 +187,6 @@ static bool pt_event_valid(struct perf_event *event) * These all are cpu affine and operate on a local PT */ -static bool pt_is_running(void) -{ - u64 ctl; - - rdmsrl(MSR_IA32_RTIT_CTL, ctl); - - return !!(ctl & RTIT_CTL_TRACEEN); -} - static void pt_config(struct perf_event *event) { u64 reg; @@ -933,7 +924,7 @@ static void pt_event_start(struct perf_event *event, int mode) struct pt *pt = this_cpu_ptr(&pt_ctx); struct pt_buffer *buf = perf_get_aux(&pt->handle); - if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) { + if (!buf || pt_buffer_is_full(buf, pt)) { event->hw.state = PERF_HES_STOPPED; return; } -- cgit v1.2.1 From a82d24edfeaf1ed244cf8b969916840c6feb5165 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 22 May 2015 18:30:26 +0300 Subject: perf/x86/intel/pt: Remove redundant variable declaration There is a 'pt' variable in the outer scope of pt_event_stop() with the same type, we don't really need another one in the inner scope. This patch removes the redundant variable declaration. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1432308626-18845-8-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index a2d407172d61..59596d25cb29 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -955,7 +955,6 @@ static void pt_event_stop(struct perf_event *event, int mode) event->hw.state = PERF_HES_STOPPED; if (mode & PERF_EF_UPDATE) { - struct pt *pt = this_cpu_ptr(&pt_ctx); struct pt_buffer *buf = perf_get_aux(&pt->handle); if (!buf) -- cgit v1.2.1 From 47f01e8cc23f3d041f6b9fb97627369eaf75ba7f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 May 2015 12:22:29 +0200 Subject: x86/fpu: Fix FPU register read access to the current task Bobby Powers reported the following FPU warning during ELF coredumping: WARNING: CPU: 0 PID: 27452 at arch/x86/kernel/fpu/core.c:324 fpu__activate_stopped+0x8a/0xa0() This warning unearthed an invalid assumption about fpu__activate_stopped() that I added in: 67e97fc2ec57 ("x86/fpu: Rename init_fpu() to fpu__unlazy_stopped() and add debugging check") the old init_fpu() function had an (intentional but obscure) side effect: when FPU registers are accessed for the current task, for reading, then it synchronized live in-register FPU state with the fpstate by saving it. So fix this bug by saving the FPU if we are the current task. We'll still warn in fpu__save() if this is called for not yet stopped child tasks, so the debugging check is still preserved. Also rename the function to fpu__activate_fpstate(), because it's not exclusively used for stopped tasks, but for the current task as well. ( Note that this bug calls for a cleaner separation of access-for-read and access-for-modification FPU methods, but we'll do that in separate patches. ) Reported-by: Bobby Powers Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/core.c | 43 +++++++++++++++++++++---------------- arch/x86/kernel/fpu/regset.c | 12 +++++------ 3 files changed, 32 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 99690bed920a..62d13d515f95 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -22,7 +22,7 @@ * High level FPU state handling functions: */ extern void fpu__activate_curr(struct fpu *fpu); -extern void fpu__activate_stopped(struct fpu *fpu); +extern void fpu__activate_fpstate(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 01a15503c3be..b41049247cfa 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -296,40 +296,47 @@ void fpu__activate_curr(struct fpu *fpu) EXPORT_SYMBOL_GPL(fpu__activate_curr); /* - * This function must be called before we modify a stopped child's - * fpstate. + * This function must be called before we read or write a task's fpstate. * - * If the child has not used the FPU before then initialize its + * If the task has not used the FPU before then initialize its * fpstate. * - * If the child has used the FPU before then unlazy it. + * If the task has used the FPU before then save and unlazy it. * - * [ After this function call, after registers in the fpstate are + * [ If this function is used for non-current child tasks, then + * after this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will * restore the modified FPU state from the modified context. If we * didn't clear its lazy status here then the lazy in-registers * state pending on its former CPU could be restored, corrupting - * the modifications. ] + * the modifications. * - * This function is also called before we read a stopped child's - * FPU state - to make sure it's initialized if the child has - * no active FPU state. + * This function can be used for the current task as well, but + * only for reading the fpstate. Modifications to the fpstate + * will be lost on eagerfpu systems. ] * * TODO: A future optimization would be to skip the unlazying in * the read-only case, it's not strictly necessary for * read-only access to the context. */ -void fpu__activate_stopped(struct fpu *child_fpu) +void fpu__activate_fpstate(struct fpu *fpu) { - WARN_ON_FPU(child_fpu == ¤t->thread.fpu); - - if (child_fpu->fpstate_active) { - child_fpu->last_cpu = -1; + /* + * If fpregs are active (in the current CPU), then + * copy them to the fpstate: + */ + if (fpu->fpregs_active) { + fpu__save(fpu); } else { - fpstate_init(&child_fpu->state); - - /* Safe to do for stopped child tasks: */ - child_fpu->fpstate_active = 1; + if (fpu->fpstate_active) { + /* Invalidate any lazy state: */ + fpu->last_cpu = -1; + } else { + fpstate_init(&fpu->state); + + /* Safe to do for current and for stopped child tasks: */ + fpu->fpstate_active = 1; + } } } diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 297b3da8e4c4..a1f97d9d6a45 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -33,7 +33,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - fpu__activate_stopped(fpu); + fpu__activate_fpstate(fpu); fpstate_sanitize_xstate(fpu); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -50,7 +50,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - fpu__activate_stopped(fpu); + fpu__activate_fpstate(fpu); fpstate_sanitize_xstate(fpu); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -82,7 +82,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_xsave) return -ENODEV; - fpu__activate_stopped(fpu); + fpu__activate_fpstate(fpu); xsave = &fpu->state.xsave; @@ -111,7 +111,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_xsave) return -ENODEV; - fpu__activate_stopped(fpu); + fpu__activate_fpstate(fpu); xsave = &fpu->state.xsave; @@ -273,7 +273,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; - fpu__activate_stopped(fpu); + fpu__activate_fpstate(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); @@ -303,7 +303,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__activate_stopped(fpu); + fpu__activate_fpstate(fpu); fpstate_sanitize_xstate(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) -- cgit v1.2.1 From 0560281266b313400b622c5ddfafb0ee8e59c702 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 May 2015 12:22:29 +0200 Subject: x86/fpu: Split out the fpu__activate_fpstate_read() method Currently fpu__activate_fpstate() is used for two distinct purposes: - read access by ptrace and core dumping, where in the core dumping case the current task's FPU state may be examined as well. - write access by ptrace, which modifies FPU registers and expects the modified registers to be reloaded on the next context switch. Split out the reading side into fpu__activate_fpstate_read(). ( Note that this is just a pure duplication of fpu__activate_fpstate() for the time being, we'll optimize the new function in the next patch. ) Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 1 + arch/x86/kernel/fpu/core.c | 29 +++++++++++++++++++++++++++++ arch/x86/kernel/fpu/regset.c | 6 +++--- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 62d13d515f95..3cc2086b97f8 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -22,6 +22,7 @@ * High level FPU state handling functions: */ extern void fpu__activate_curr(struct fpu *fpu); +extern void fpu__activate_fpstate_read(struct fpu *fpu); extern void fpu__activate_fpstate(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b41049247cfa..174add372bb8 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -295,6 +295,35 @@ void fpu__activate_curr(struct fpu *fpu) } EXPORT_SYMBOL_GPL(fpu__activate_curr); +/* + * This function must be called before we read a task's fpstate. + * + * If the task has not used the FPU before then initialize its + * fpstate. + * + * If the task has used the FPU before then save it. + */ +void fpu__activate_fpstate_read(struct fpu *fpu) +{ + /* + * If fpregs are active (in the current CPU), then + * copy them to the fpstate: + */ + if (fpu->fpregs_active) { + fpu__save(fpu); + } else { + if (fpu->fpstate_active) { + /* Invalidate any lazy state: */ + fpu->last_cpu = -1; + } else { + fpstate_init(&fpu->state); + + /* Safe to do for current and for stopped child tasks: */ + fpu->fpstate_active = 1; + } + } +} + /* * This function must be called before we read or write a task's fpstate. * diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index a1f97d9d6a45..4e40585a9c8f 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -33,7 +33,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - fpu__activate_fpstate(fpu); + fpu__activate_fpstate_read(fpu); fpstate_sanitize_xstate(fpu); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -82,7 +82,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_xsave) return -ENODEV; - fpu__activate_fpstate(fpu); + fpu__activate_fpstate_read(fpu); xsave = &fpu->state.xsave; @@ -273,7 +273,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; - fpu__activate_fpstate(fpu); + fpu__activate_fpstate_read(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); -- cgit v1.2.1 From 9ba6b79102a594293c79f30319cabf476c5e300e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 May 2015 12:22:29 +0200 Subject: x86/fpu: Optimize fpu__activate_fpstate_read() fpu__activate_fpstate_read() is used before FPU registers are read from the fpstate by ptrace and core dumping. It's not necessary to unlazy non-current child tasks in this case, since the reading of registers is non-destructive. Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 174add372bb8..06cb7e3e9886 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -312,10 +312,7 @@ void fpu__activate_fpstate_read(struct fpu *fpu) if (fpu->fpregs_active) { fpu__save(fpu); } else { - if (fpu->fpstate_active) { - /* Invalidate any lazy state: */ - fpu->last_cpu = -1; - } else { + if (!fpu->fpstate_active) { fpstate_init(&fpu->state); /* Safe to do for current and for stopped child tasks: */ -- cgit v1.2.1 From 6a81d7eb330479c908dab3a47ac33cfca8af5a67 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 May 2015 12:22:29 +0200 Subject: x86/fpu: Rename fpu__activate_fpstate() to fpu__activate_fpstate_write() Remaining users of fpu__activate_fpstate() are all places that want to modify FPU registers, rename the function to fpu__activate_fpstate_write() according to this usage. Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/regset.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3cc2086b97f8..e3bd93c84928 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -23,7 +23,7 @@ */ extern void fpu__activate_curr(struct fpu *fpu); extern void fpu__activate_fpstate_read(struct fpu *fpu); -extern void fpu__activate_fpstate(struct fpu *fpu); +extern void fpu__activate_fpstate_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 06cb7e3e9886..6b0955a62d34 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -345,7 +345,7 @@ void fpu__activate_fpstate_read(struct fpu *fpu) * the read-only case, it's not strictly necessary for * read-only access to the context. */ -void fpu__activate_fpstate(struct fpu *fpu) +void fpu__activate_fpstate_write(struct fpu *fpu) { /* * If fpregs are active (in the current CPU), then diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 4e40585a9c8f..dc60810c1c74 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -50,7 +50,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - fpu__activate_fpstate(fpu); + fpu__activate_fpstate_write(fpu); fpstate_sanitize_xstate(fpu); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -111,7 +111,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_xsave) return -ENODEV; - fpu__activate_fpstate(fpu); + fpu__activate_fpstate_write(fpu); xsave = &fpu->state.xsave; @@ -303,7 +303,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__activate_fpstate(fpu); + fpu__activate_fpstate_write(fpu); fpstate_sanitize_xstate(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) -- cgit v1.2.1 From 343763c3b0a4fa2d29f7c3ba405abf8771b90876 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 May 2015 12:22:29 +0200 Subject: x86/fpu: Optimize fpu__activate_fpstate_write() fpu__activate_fpstate_write() is used before ptrace writes to the fpstate context. Because it expects the modified registers to be reloaded on the nexts context switch, it's only valid to call this function for stopped child tasks. - add a debugging check for this assumption - remove code that only runs if the current task's FPU state needs to be saved, which cannot occur here - update comments to match the implementation Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 51 +++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 6b0955a62d34..86a9a9a086fa 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -322,47 +322,34 @@ void fpu__activate_fpstate_read(struct fpu *fpu) } /* - * This function must be called before we read or write a task's fpstate. + * This function must be called before we write a task's fpstate. * - * If the task has not used the FPU before then initialize its - * fpstate. - * - * If the task has used the FPU before then save and unlazy it. - * - * [ If this function is used for non-current child tasks, then - * after this function call, after registers in the fpstate are - * modified and the child task has woken up, the child task will - * restore the modified FPU state from the modified context. If we - * didn't clear its lazy status here then the lazy in-registers - * state pending on its former CPU could be restored, corrupting - * the modifications. + * If the task has used the FPU before then unlazy it. + * If the task has not used the FPU before then initialize its fpstate. * - * This function can be used for the current task as well, but - * only for reading the fpstate. Modifications to the fpstate - * will be lost on eagerfpu systems. ] - * - * TODO: A future optimization would be to skip the unlazying in - * the read-only case, it's not strictly necessary for - * read-only access to the context. + * After this function call, after registers in the fpstate are + * modified and the child task has woken up, the child task will + * restore the modified FPU state from the modified context. If we + * didn't clear its lazy status here then the lazy in-registers + * state pending on its former CPU could be restored, corrupting + * the modifications. */ void fpu__activate_fpstate_write(struct fpu *fpu) { /* - * If fpregs are active (in the current CPU), then - * copy them to the fpstate: + * Only stopped child tasks can be used to modify the FPU + * state in the fpstate buffer: */ - if (fpu->fpregs_active) { - fpu__save(fpu); + WARN_ON_FPU(fpu == ¤t->thread.fpu); + + if (fpu->fpstate_active) { + /* Invalidate any lazy state: */ + fpu->last_cpu = -1; } else { - if (fpu->fpstate_active) { - /* Invalidate any lazy state: */ - fpu->last_cpu = -1; - } else { - fpstate_init(&fpu->state); + fpstate_init(&fpu->state); - /* Safe to do for current and for stopped child tasks: */ - fpu->fpstate_active = 1; - } + /* Safe to do for stopped child tasks: */ + fpu->fpstate_active = 1; } } -- cgit v1.2.1 From ce2a1e67f1738535b011a7b4bd42cc114b1d805f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2015 10:57:06 +0200 Subject: x86/fpu: Add debugging check to fpu__restore() The copy_fpstate_to_fpregs() function is never supposed to fail, so add a debugging check to its call site in fpu__restore(). Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 86a9a9a086fa..874ef1701750 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -371,6 +371,8 @@ void fpu__restore(struct fpu *fpu) kernel_fpu_disable(); fpregs_activate(fpu); if (unlikely(copy_fpstate_to_fpregs(fpu))) { + /* Copying the kernel state to FPU registers should never fail: */ + WARN_ON_FPU(1); fpu__clear(fpu); force_sig_info(SIGSEGV, SEND_SIG_PRIV, current); } else { -- cgit v1.2.1 From 43b287b3f4d8665cd5a4909132259b663cc1c0e3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2015 10:59:31 +0200 Subject: x86/fpu: Add debugging checks to all copy_kernel_to_*() functions Copying from in-kernel FPU context buffers to FPU registers are never supposed to fault. Add debugging checks to copy_kernel_to_fxregs() and copy_kernel_to_fregs() to double check this assumption. Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index e3bd93c84928..eb8fa0f9d279 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -143,14 +143,22 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) static inline int copy_kernel_to_fxregs(struct fxregs_state *fx) { - if (config_enabled(CONFIG_X86_32)) - return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + int err; - /* See comment in copy_fxregs_to_kernel() below. */ - return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), - "m" (*fx)); + if (config_enabled(CONFIG_X86_32)) { + err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + } else { + if (config_enabled(CONFIG_AS_FXSAVEQ)) { + err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + } else { + /* See comment in copy_fxregs_to_kernel() below. */ + err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); + } + } + /* Copying from a kernel buffer to FPU registers should never fail: */ + WARN_ON_FPU(err); + + return err; } static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) @@ -167,7 +175,11 @@ static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) static inline int copy_kernel_to_fregs(struct fregs_state *fx) { - return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + + WARN_ON_FPU(err); + + return err; } static inline int copy_user_to_fregs(struct fregs_state __user *fx) -- cgit v1.2.1 From 3e1bf47e5c81c2b895db4bea67f70c3ca8e5b984 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 May 2015 13:47:01 +0200 Subject: x86/fpu: Rename copy_fpstate_to_fpregs() to copy_kernel_to_fpregs() Bring the __copy_fpstate_to_fpregs() and copy_fpstate_to_fpregs() functions in line with the naming of other kernel-to-FPU-registers copying functions. Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 8 ++++---- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index eb8fa0f9d279..6193b7a9cf00 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -450,7 +450,7 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) return 0; } -static inline int __copy_fpstate_to_fpregs(struct fpu *fpu) +static inline int __copy_kernel_to_fpregs(struct fpu *fpu) { if (use_xsave()) { copy_kernel_to_xregs(&fpu->state.xsave, -1); @@ -463,7 +463,7 @@ static inline int __copy_fpstate_to_fpregs(struct fpu *fpu) } } -static inline int copy_fpstate_to_fpregs(struct fpu *fpu) +static inline int copy_kernel_to_fpregs(struct fpu *fpu) { /* * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is @@ -478,7 +478,7 @@ static inline int copy_fpstate_to_fpregs(struct fpu *fpu) : : [addr] "m" (fpu->fpregs_active)); } - return __copy_fpstate_to_fpregs(fpu); + return __copy_kernel_to_fpregs(fpu); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); @@ -647,7 +647,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) { if (fpu_switch.preload) { - if (unlikely(copy_fpstate_to_fpregs(new_fpu))) { + if (unlikely(copy_kernel_to_fpregs(new_fpu))) { WARN_ON_FPU(1); fpu__clear(new_fpu); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 874ef1701750..e0e0ee565dc3 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -127,7 +127,7 @@ void __kernel_fpu_end(void) struct fpu *fpu = ¤t->thread.fpu; if (fpu->fpregs_active) { - if (WARN_ON_FPU(copy_fpstate_to_fpregs(fpu))) + if (WARN_ON_FPU(copy_kernel_to_fpregs(fpu))) fpu__clear(fpu); } else { __fpregs_deactivate_hw(); @@ -370,7 +370,7 @@ void fpu__restore(struct fpu *fpu) /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); fpregs_activate(fpu); - if (unlikely(copy_fpstate_to_fpregs(fpu))) { + if (unlikely(copy_kernel_to_fpregs(fpu))) { /* Copying the kernel state to FPU registers should never fail: */ WARN_ON_FPU(1); fpu__clear(fpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 989cfc01e2a5..66871f4937fa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7030,7 +7030,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; __kernel_fpu_begin(); - __copy_fpstate_to_fpregs(&vcpu->arch.guest_fpu); + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } -- cgit v1.2.1 From 9ccc27a5d297503e485373b69688d038a1d8e662 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2015 11:27:46 +0200 Subject: x86/fpu: Remove error return values from copy_kernel_to_*regs() functions None of the copy_kernel_to_*regs() FPU register copying functions are supposed to fail, and all of them have debugging checks that enforce this. Remove their return values and simplify their call sites, which have redundant error checks and error handling code paths. Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 27 +++++++++------------------ arch/x86/kernel/fpu/core.c | 18 +++++------------- 2 files changed, 14 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 6193b7a9cf00..da71d41227ff 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -141,7 +141,7 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); } -static inline int copy_kernel_to_fxregs(struct fxregs_state *fx) +static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) { int err; @@ -157,8 +157,6 @@ static inline int copy_kernel_to_fxregs(struct fxregs_state *fx) } /* Copying from a kernel buffer to FPU registers should never fail: */ WARN_ON_FPU(err); - - return err; } static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) @@ -173,13 +171,11 @@ static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) "m" (*fx)); } -static inline int copy_kernel_to_fregs(struct fregs_state *fx) +static inline void copy_kernel_to_fregs(struct fregs_state *fx) { int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); WARN_ON_FPU(err); - - return err; } static inline int copy_user_to_fregs(struct fregs_state __user *fx) @@ -450,20 +446,19 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) return 0; } -static inline int __copy_kernel_to_fpregs(struct fpu *fpu) +static inline void __copy_kernel_to_fpregs(struct fpu *fpu) { if (use_xsave()) { copy_kernel_to_xregs(&fpu->state.xsave, -1); - return 0; } else { if (use_fxsr()) - return copy_kernel_to_fxregs(&fpu->state.fxsave); + copy_kernel_to_fxregs(&fpu->state.fxsave); else - return copy_kernel_to_fregs(&fpu->state.fsave); + copy_kernel_to_fregs(&fpu->state.fsave); } } -static inline int copy_kernel_to_fpregs(struct fpu *fpu) +static inline void copy_kernel_to_fpregs(struct fpu *fpu) { /* * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is @@ -478,7 +473,7 @@ static inline int copy_kernel_to_fpregs(struct fpu *fpu) : : [addr] "m" (fpu->fpregs_active)); } - return __copy_kernel_to_fpregs(fpu); + __copy_kernel_to_fpregs(fpu); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); @@ -646,12 +641,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) */ static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) { - if (fpu_switch.preload) { - if (unlikely(copy_kernel_to_fpregs(new_fpu))) { - WARN_ON_FPU(1); - fpu__clear(new_fpu); - } - } + if (fpu_switch.preload) + copy_kernel_to_fpregs(new_fpu); } /* diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e0e0ee565dc3..8470df44c06d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -126,12 +126,10 @@ void __kernel_fpu_end(void) { struct fpu *fpu = ¤t->thread.fpu; - if (fpu->fpregs_active) { - if (WARN_ON_FPU(copy_kernel_to_fpregs(fpu))) - fpu__clear(fpu); - } else { + if (fpu->fpregs_active) + copy_kernel_to_fpregs(fpu); + else __fpregs_deactivate_hw(); - } kernel_fpu_enable(); } @@ -370,14 +368,8 @@ void fpu__restore(struct fpu *fpu) /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); fpregs_activate(fpu); - if (unlikely(copy_kernel_to_fpregs(fpu))) { - /* Copying the kernel state to FPU registers should never fail: */ - WARN_ON_FPU(1); - fpu__clear(fpu); - force_sig_info(SIGSEGV, SEND_SIG_PRIV, current); - } else { - fpu->counter++; - } + copy_kernel_to_fpregs(fpu); + fpu->counter++; kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); -- cgit v1.2.1 From 003e2e8b57e79709e4973f6cb48381b2ba396680 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2015 11:59:35 +0200 Subject: x86/fpu: Standardize the parameter type of copy_kernel_to_fpregs() Bring the __copy_fpstate_to_fpregs() and copy_fpstate_to_fpregs() functions in line with the parameter passing convention of other kernel-to-FPU-registers copying functions: pass around an in-memory FPU register state pointer, instead of struct fpu *. NOTE: This patch also changes the assembly constraint of the FXSAVE-leak workaround from 'fpu->fpregs_active' to 'fpstate' - but that is fine, as we only need a valid memory address there for the FILDL instruction. Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 16 ++++++++-------- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index da71d41227ff..12acbb32a561 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -446,19 +446,19 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) return 0; } -static inline void __copy_kernel_to_fpregs(struct fpu *fpu) +static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate) { if (use_xsave()) { - copy_kernel_to_xregs(&fpu->state.xsave, -1); + copy_kernel_to_xregs(&fpstate->xsave, -1); } else { if (use_fxsr()) - copy_kernel_to_fxregs(&fpu->state.fxsave); + copy_kernel_to_fxregs(&fpstate->fxsave); else - copy_kernel_to_fregs(&fpu->state.fsave); + copy_kernel_to_fregs(&fpstate->fsave); } } -static inline void copy_kernel_to_fpregs(struct fpu *fpu) +static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) { /* * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is @@ -470,10 +470,10 @@ static inline void copy_kernel_to_fpregs(struct fpu *fpu) "fnclex\n\t" "emms\n\t" "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (fpu->fpregs_active)); + : : [addr] "m" (fpstate)); } - __copy_kernel_to_fpregs(fpu); + __copy_kernel_to_fpregs(fpstate); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); @@ -642,7 +642,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) { if (fpu_switch.preload) - copy_kernel_to_fpregs(new_fpu); + copy_kernel_to_fpregs(&new_fpu->state); } /* diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8470df44c06d..79de954626fd 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -127,7 +127,7 @@ void __kernel_fpu_end(void) struct fpu *fpu = ¤t->thread.fpu; if (fpu->fpregs_active) - copy_kernel_to_fpregs(fpu); + copy_kernel_to_fpregs(&fpu->state); else __fpregs_deactivate_hw(); @@ -368,7 +368,7 @@ void fpu__restore(struct fpu *fpu) /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); fpregs_activate(fpu); - copy_kernel_to_fpregs(fpu); + copy_kernel_to_fpregs(&fpu->state); fpu->counter++; kernel_fpu_enable(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 66871f4937fa..26eaeb522cab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7030,7 +7030,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; __kernel_fpu_begin(); - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu); + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state); trace_kvm_fpu(1); } -- cgit v1.2.1 From d65fcd608f6a9ac0316bd3efc75956ca329b6571 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 May 2015 14:04:44 +0200 Subject: x86/fpu: Simplify copy_kernel_to_xregs_booting() copy_kernel_to_xregs_booting() has a second parameter that is the mask of xfeatures that should be copied - but this parameter is always -1. Simplify the call site of this function, this also makes it more similar to the function call signature of other copy_kernel_to*regs() functions. Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 3 ++- arch/x86/kernel/fpu/xstate.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 12acbb32a561..9f3863872007 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -271,8 +271,9 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate, u64 mask) +static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) { + u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; int err = 0; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index ad4dd26ebd59..a580eb5c7e52 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -281,7 +281,7 @@ static void __init setup_init_fpu_buf(void) /* * Init all the features state with header_bv being 0x0 */ - copy_kernel_to_xregs_booting(&init_fpstate.xsave, -1); + copy_kernel_to_xregs_booting(&init_fpstate.xsave); /* * Dump the init state again. This is to identify the init state -- cgit v1.2.1 From 83242c515881cc8642d726c3e648e41bf6c24551 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 May 2015 12:22:29 +0200 Subject: x86/fpu: Make WARN_ON_FPU() more robust in the !CONFIG_X86_DEBUG_FPU case Make sure the WARN_ON_FPU() macro consumes the macro argument, to avoid 'unused variable' build warnings if the only use of a variable is in debugging code. Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 9f3863872007..3c3550c3a4a3 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -49,7 +49,7 @@ extern void fpu__resume_cpu(void); #ifdef CONFIG_X86_DEBUG_FPU # define WARN_ON_FPU(x) WARN_ON_ONCE(x) #else -# define WARN_ON_FPU(x) ({ 0; }) +# define WARN_ON_FPU(x) ({ (void)(x); 0; }) #endif /* -- cgit v1.2.1 From 0fb0328d3458ff2d6ffbb280b75053c99a8a4b1f Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 26 May 2015 10:28:09 +0200 Subject: sched/x86: Drop repeated word from mwait_idle() comment A single "default" is fine. Signed-off-by: Huang Rui [ Fix another typo and reflow comment. ] Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1432022472-2224-5-git-send-email-ray.huang@amd.com Link: http://lkml.kernel.org/r/1432628901-18044-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6e338e3b1dc0..c648139d68d7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -445,11 +445,10 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) } /* - * MONITOR/MWAIT with no hints, used for default default C1 state. - * This invokes MWAIT with interrutps enabled and no flags, - * which is backwards compatible with the original MWAIT implementation. + * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT + * with interrupts enabled and no flags, which is backwards compatible with the + * original MWAIT implementation. */ - static void mwait_idle(void) { if (!current_set_polling_and_test()) { -- cgit v1.2.1 From adafb98da6a7af5e45362933a7dae6ab0e5076bf Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 26 May 2015 10:28:17 +0200 Subject: x86/cpu: Strip any /proc/cpuinfo model name field whitespace When comparing the 'model name' field of each core in /proc/cpuinfo it was noticed that there is a whitespace difference between the cores' model names. After some quick investigation it was noticed that the model name fields were actually different -- processor 0's model name field had trailing whitespace removed, while the other processors did not. Another way of seeing this behaviour is to convert spaces into underscores in the output of /proc/cpuinfo, [thetango@prarit ~]# grep "^model name" /proc/cpuinfo | uniq -c | sed 's/\ /_/g' ______1_model_name :_AMD_Opteron(TM)_Processor_6272 _____63_model_name :_AMD_Opteron(TM)_Processor_6272_________________ which shows the discrepancy. This occurs because the kernel calls strim() on cpu 0's x86_model_id field to output a pretty message to the console in print_cpu_info(), and as a result strips the whitespace at the end of the ->x86_model_id field. But, the ->x86_model_id field should be the same for the all identical CPUs in the box. Thus, we need to remove both leading and trailing whitespace. As a result, the print_cpu_info() output looks like smpboot: CPU0: AMD Opteron(TM) Processor 6272 (fam: 15, model: 01, stepping: 02) and the x86_model_id field is correct on all processors on AMD platforms: _____64_model_name :_AMD_Opteron(TM)_Processor_6272 Output is still correct on an Intel box: ____144_model_name :_Intel(R)_Xeon(R)_CPU_E7-8890_v3_@_2.50GHz Signed-off-by: Prarit Bhargava Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Igor Mammedov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1432050210-32036-1-git-send-email-prarit@redhat.com Link: http://lkml.kernel.org/r/1432628901-18044-15-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a62cf04dac8a..41a8e9cb30bc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -419,7 +419,6 @@ static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; static void get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; - char *p, *q; if (c->extended_cpuid_level < 0x80000004) return; @@ -431,18 +430,10 @@ static void get_model_name(struct cpuinfo_x86 *c) c->x86_model_id[48] = 0; /* - * Intel chips right-justify this string for some dumb reason; - * undo that brain damage: + * Remove leading whitespace on Intel processors and trailing + * whitespace on AMD processors. */ - p = q = &c->x86_model_id[0]; - while (*p == ' ') - p++; - if (p != q) { - while (*p) - *q++ = *p++; - while (q <= &c->x86_model_id[48]) - *q++ = '\0'; /* Zero-pad the rest */ - } + memmove(c->x86_model_id, strim(c->x86_model_id), 48); } void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) @@ -1122,7 +1113,7 @@ void print_cpu_info(struct cpuinfo_x86 *c) printk(KERN_CONT "%s ", vendor); if (c->x86_model_id[0]) - printk(KERN_CONT "%s", strim(c->x86_model_id)); + printk(KERN_CONT "%s", c->x86_model_id); else printk(KERN_CONT "%d86", c->x86); -- cgit v1.2.1 From 5c31b2800d8d3e735e5ecac8fc13d1cf862fd330 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 26 May 2015 10:28:21 +0200 Subject: x86/mce: Fix monarch timeout setting through the mce= cmdline option Using "mce=1,10000000" on the kernel cmdline to change the monarch timeout does not work. The cause is that get_option() does parse a subsequent comma in the option string and signals that with a return value. So we don't need to check for a second comma ourselves. Signed-off-by: Xie XiuQi Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1432120943-25028-1-git-send-email-xiexiuqi@huawei.com Link: http://lkml.kernel.org/r/1432628901-18044-19-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 521e5016aca6..0cbcd3183acf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2014,11 +2014,8 @@ static int __init mcheck_enable(char *str) else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; else if (isdigit(str[0])) { - get_option(&str, &(cfg->tolerant)); - if (*str == ',') { - ++str; + if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); - } } else { pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; -- cgit v1.2.1 From 10455f64aff0d715dcdfb09b02393df168fe267e Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:04 +0200 Subject: x86/mm/kconfig: Simplify conditions for HAVE_ARCH_HUGE_VMAP Simplify the conditions selecting HAVE_ARCH_HUGE_VMAP since X86_PAE depends on X86_32 already. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-2-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 226d5696e1d1..4eb0b0ffae85 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -100,7 +100,7 @@ config X86 select IRQ_FORCED_THREADING select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE) + select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select ARCH_HAS_SG_CHAIN select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG -- cgit v1.2.1 From 7f0431e3dc8953f41e9433581c1fdd7ee45860b0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:05 +0200 Subject: x86/mm/mtrr: Fix MTRR lookup to handle an inclusive entry When an MTRR entry is inclusive to a requested range, i.e. the start and end of the request are not within the MTRR entry range but the range contains the MTRR entry entirely: range_start ... [mtrr_start ... mtrr_end] ... range_end __mtrr_type_lookup() ignores such a case because both start_state and end_state are set to zero. This bug can cause the following issues: 1) reserve_memtype() tracks an effective memory type in case a request type is WB (ex. /dev/mem blindly uses WB). Missing to track with its effective type causes a subsequent request to map the same range with the effective type to fail. 2) pud_set_huge() and pmd_set_huge() check if a requested range has any overlap with MTRRs. Missing to detect an overlap may cause a performance penalty or undefined behavior. This patch fixes the bug by adding a new flag, 'inclusive', to detect the inclusive case. This case is then handled in the same way as end_state:1 since the first region is the same. With this fix, __mtrr_type_lookup() handles the inclusive case properly. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-3-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 5b239679cfc9..e202d26f64a2 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -154,7 +154,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; + unsigned short start_state, end_state, inclusive; if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) continue; @@ -166,19 +166,27 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); + inclusive = ((start < base) && (end > base)); - if (start_state != end_state) { + if ((start_state != end_state) || inclusive) { /* * We have start:end spanning across an MTRR. - * We split the region into - * either - * (start:mtrr_end) (mtrr_end:end) - * or - * (start:mtrr_start) (mtrr_start:end) + * We split the region into either + * + * - start_state:1 + * (start:mtrr_end)(mtrr_end:end) + * - end_state:1 + * (start:mtrr_start)(mtrr_start:end) + * - inclusive:1 + * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end) + * * depending on kind of overlap. - * Return the type for first region and a pointer to - * the start of second region so that caller will - * lookup again on the second region. + * + * Return the type of the first region and a pointer + * to the start of next region so that caller will be + * advised to lookup again after having adjusted start + * and end. + * * Note: This way we handle multiple overlaps as well. */ if (start_state) -- cgit v1.2.1 From 9b3aca620883fc06636737c82a4d024b22182281 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:06 +0200 Subject: x86/mm/mtrr: Fix MTRR state checks in mtrr_type_lookup() 'mtrr_state.enabled' contains the FE (fixed MTRRs enabled) and E (MTRRs enabled) flags in MSR_MTRRdefType. Intel SDM, section 11.11.2.1, defines these flags as follows: - All MTRRs are disabled when the E flag is clear. The FE flag has no affect when the E flag is clear. - The default type is enabled when the E flag is set. - MTRR variable ranges are enabled when the E flag is set. - MTRR fixed ranges are enabled when both E and FE flags are set. MTRR state checks in __mtrr_type_lookup() do not match with SDM. Hence, this patch makes the following changes: - The current code detects MTRRs disabled when both E and FE flags are clear in mtrr_state.enabled. Fix to detect MTRRs disabled when the E flag is clear. - The current code does not check if the FE bit is set in mtrr_state.enabled when looking at the fixed entries. Fix to check the FE flag. - The current code returns the default type when the E flag is clear in mtrr_state.enabled. However, the default type is UC when the E flag is clear. Remove the code as this case is handled as MTRR disabled with the 1st change. In addition, this patch defines the E and FE flags in mtrr_state.enabled as follows. - FE flag: MTRR_STATE_MTRR_FIXED_ENABLED - E flag: MTRR_STATE_MTRR_ENABLED print_mtrr_state() and x86_get_mtrr_mem_range() are also updated accordingly. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-4-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mtrr.h | 4 ++++ arch/x86/kernel/cpu/mtrr/cleanup.c | 3 ++- arch/x86/kernel/cpu/mtrr/generic.c | 15 ++++++++------- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index f768f6298419..ef927948657c 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -127,4 +127,8 @@ struct mtrr_gentry32 { _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32) #endif /* CONFIG_COMPAT */ +/* Bit fields for enabled in struct mtrr_state_type */ +#define MTRR_STATE_MTRR_FIXED_ENABLED 0x01 +#define MTRR_STATE_MTRR_ENABLED 0x02 + #endif /* _ASM_X86_MTRR_H */ diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 5f90b85ff22e..70d7c93f4550 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -98,7 +98,8 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, continue; base = range_state[i].base_pfn; if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && - (mtrr_state.enabled & 1)) { + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { /* Var MTRR contains UC entry below 1M? Skip it: */ printk(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e202d26f64a2..b0599dbb899a 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -119,14 +119,16 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) if (!mtrr_state_set) return 0xFF; - if (!mtrr_state.enabled) + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) return 0xFF; /* Make end inclusive end, instead of exclusive */ end--; /* Look in fixed ranges. Just return the type as per start */ - if (mtrr_state.have_fixed && (start < 0x100000)) { + if ((start < 0x100000) && + (mtrr_state.have_fixed) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { int idx; if (start < 0x80000) { @@ -149,9 +151,6 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - if (!(mtrr_state.enabled & 2)) - return mtrr_state.def_type; - prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -355,7 +354,9 @@ static void __init print_mtrr_state(void) mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { pr_debug("MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? + "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, @@ -368,7 +369,7 @@ static void __init print_mtrr_state(void) print_fixed_last(); } pr_debug("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); + mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { -- cgit v1.2.1 From 3d3ca416d9b0784cfcf244eeeba1bcaf421bc64d Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:07 +0200 Subject: x86/mm/mtrr: Use symbolic define as a retval for disabled MTRRs mtrr_type_lookup() returns verbatim 0xFF when MTRRs are disabled. This patch defines MTRR_TYPE_INVALID to clarify the meaning of this value, and documents its usage. Document the return values of the kernel virtual address mapping helpers pud_set_huge(), pmd_set_huge, pud_clear_huge() and pmd_clear_huge(). There is no functional change in this patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-5-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mtrr.h | 2 +- arch/x86/include/uapi/asm/mtrr.h | 8 +++++++- arch/x86/kernel/cpu/mtrr/generic.c | 14 ++++++------- arch/x86/mm/pgtable.c | 42 +++++++++++++++++++++++++++++--------- 4 files changed, 47 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index ef927948657c..bb03a547c1ab 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -55,7 +55,7 @@ static inline u8 mtrr_type_lookup(u64 addr, u64 end) /* * Return no-MTRRs: */ - return 0xff; + return MTRR_TYPE_INVALID; } #define mtrr_save_fixed_ranges(arg) do {} while (0) #define mtrr_save_state() do {} while (0) diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h index d0acb658c8f4..7528dcf59691 100644 --- a/arch/x86/include/uapi/asm/mtrr.h +++ b/arch/x86/include/uapi/asm/mtrr.h @@ -103,7 +103,7 @@ struct mtrr_state_type { #define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry) #define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry) -/* These are the region types */ +/* MTRR memory types, which are defined in SDM */ #define MTRR_TYPE_UNCACHABLE 0 #define MTRR_TYPE_WRCOMB 1 /*#define MTRR_TYPE_ 2*/ @@ -113,5 +113,11 @@ struct mtrr_state_type { #define MTRR_TYPE_WRBACK 6 #define MTRR_NUM_TYPES 7 +/* + * Invalid MTRR memory type. mtrr_type_lookup() returns this value when + * MTRRs are disabled. Note, this value is allocated from the reserved + * values (0x7-0xff) of the MTRR memory types. + */ +#define MTRR_TYPE_INVALID 0xff #endif /* _UAPI_ASM_X86_MTRR_H */ diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index b0599dbb899a..7b1491c6232d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -104,7 +104,7 @@ static int check_type_overlap(u8 *prev, u8 *curr) /* * Error/Semi-error returns: - * 0xFF - when MTRR is not enabled + * MTRR_TYPE_INVALID - when MTRR is not enabled * *repeat == 1 implies [start:end] spanned across MTRR range and type returned * corresponds only to [start:*partial_end]. * Caller has to lookup again for [*partial_end:end]. @@ -117,10 +117,10 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) *repeat = 0; if (!mtrr_state_set) - return 0xFF; + return MTRR_TYPE_INVALID; if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) - return 0xFF; + return MTRR_TYPE_INVALID; /* Make end inclusive end, instead of exclusive */ end--; @@ -151,7 +151,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - prev_match = 0xFF; + prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -206,7 +206,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) continue; curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { + if (prev_match == MTRR_TYPE_INVALID) { prev_match = curr_match; continue; } @@ -220,7 +220,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) return MTRR_TYPE_WRBACK; } - if (prev_match != 0xFF) + if (prev_match != MTRR_TYPE_INVALID) return prev_match; return mtrr_state.def_type; @@ -229,7 +229,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) /* * Returns the effective MTRR type for the region * Error return: - * 0xFF - when MTRR is not enabled + * MTRR_TYPE_INVALID - when MTRR is not enabled */ u8 mtrr_type_lookup(u64 start, u64 end) { diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0b97d2c75df3..c30f9819786b 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -563,16 +563,22 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +/** + * pud_set_huge - setup kernel PUD mapping + * + * MTRR can override PAT memory types with 4KiB granularity. Therefore, + * this function does not set up a huge page when the range is covered + * by a non-WB type of MTRR. MTRR_TYPE_INVALID indicates that MTRR are + * disabled. + * + * Returns 1 on success and 0 on failure. + */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { u8 mtrr; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. - */ mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != MTRR_TYPE_INVALID)) return 0; prot = pgprot_4k_2_large(prot); @@ -584,16 +590,22 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pmd_set_huge - setup kernel PMD mapping + * + * MTRR can override PAT memory types with 4KiB granularity. Therefore, + * this function does not set up a huge page when the range is covered + * by a non-WB type of MTRR. MTRR_TYPE_INVALID indicates that MTRR are + * disabled. + * + * Returns 1 on success and 0 on failure. + */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { u8 mtrr; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. - */ mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != MTRR_TYPE_INVALID)) return 0; prot = pgprot_4k_2_large(prot); @@ -605,6 +617,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pud_clear_huge - clear kernel PUD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PUD map is found). + */ int pud_clear_huge(pud_t *pud) { if (pud_large(*pud)) { @@ -615,6 +632,11 @@ int pud_clear_huge(pud_t *pud) return 0; } +/** + * pmd_clear_huge - clear kernel PMD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PMD map is found). + */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_large(*pmd)) { -- cgit v1.2.1 From 0cc705f56e400764a171055f727d28a48260bb4b Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:08 +0200 Subject: x86/mm/mtrr: Clean up mtrr_type_lookup() MTRRs contain fixed and variable entries. mtrr_type_lookup() may repeatedly call __mtrr_type_lookup() to handle a request that overlaps with variable entries. However, __mtrr_type_lookup() also handles the fixed entries, which do not have to be repeated. Therefore, this patch creates separate functions, mtrr_type_lookup_fixed() and mtrr_type_lookup_variable(), to handle the fixed and variable ranges respectively. The patch also updates the function headers to clarify the return values and output argument. It updates comments to clarify that the repeating is necessary to handle overlaps with the default type, since overlaps with multiple entries alone can be handled without such repeating. There is no functional change in this patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-6-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 138 +++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7b1491c6232d..e51100c49eea 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -102,55 +102,68 @@ static int check_type_overlap(u8 *prev, u8 *curr) return 0; } -/* - * Error/Semi-error returns: - * MTRR_TYPE_INVALID - when MTRR is not enabled - * *repeat == 1 implies [start:end] spanned across MTRR range and type returned - * corresponds only to [start:*partial_end]. - * Caller has to lookup again for [*partial_end:end]. +/** + * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries + * + * Return the MTRR fixed memory type of 'start'. + * + * MTRR fixed entries are divided into the following ways: + * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges + * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges + * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges + * + * Return Values: + * MTRR_TYPE_(type) - Matched memory type + * MTRR_TYPE_INVALID - Unmatched + */ +static u8 mtrr_type_lookup_fixed(u64 start, u64 end) +{ + int idx; + + if (start >= 0x100000) + return MTRR_TYPE_INVALID; + + /* 0x0 - 0x7FFFF */ + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state.fixed_ranges[idx]; + /* 0x80000 - 0xBFFFF */ + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state.fixed_ranges[idx]; + } + + /* 0xC0000 - 0xFFFFF */ + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state.fixed_ranges[idx]; +} + +/** + * mtrr_type_lookup_variable - look up memory type in MTRR variable entries + * + * Return Value: + * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) + * + * Output Argument: + * repeat - Set to 1 when [start:end] spanned across MTRR range and type + * returned corresponds only to [start:*partial_end]. Caller has + * to lookup again for [*partial_end:end]. */ -static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) +static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, + int *repeat) { int i; u64 base, mask; u8 prev_match, curr_match; *repeat = 0; - if (!mtrr_state_set) - return MTRR_TYPE_INVALID; - - if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) - return MTRR_TYPE_INVALID; - /* Make end inclusive end, instead of exclusive */ + /* Make end inclusive instead of exclusive */ end--; - /* Look in fixed ranges. Just return the type as per start */ - if ((start < 0x100000) && - (mtrr_state.have_fixed) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state.fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state.fixed_ranges[idx]; - } else { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state.fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -186,7 +199,8 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) * advised to lookup again after having adjusted start * and end. * - * Note: This way we handle multiple overlaps as well. + * Note: This way we handle overlaps with multiple + * entries and the default type properly. */ if (start_state) *partial_end = base + get_mtrr_size(mask); @@ -215,21 +229,18 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) return curr_match; } - if (mtrr_tom2) { - if (start >= (1ULL<<32) && (end < mtrr_tom2)) - return MTRR_TYPE_WRBACK; - } - if (prev_match != MTRR_TYPE_INVALID) return prev_match; return mtrr_state.def_type; } -/* - * Returns the effective MTRR type for the region - * Error return: - * MTRR_TYPE_INVALID - when MTRR is not enabled +/** + * mtrr_type_lookup - look up memory type in MTRR + * + * Return Values: + * MTRR_TYPE_(type) - The effective MTRR type for the region + * MTRR_TYPE_INVALID - MTRR is disabled */ u8 mtrr_type_lookup(u64 start, u64 end) { @@ -237,22 +248,45 @@ u8 mtrr_type_lookup(u64 start, u64 end) int repeat; u64 partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + if (!mtrr_state_set) + return MTRR_TYPE_INVALID; + + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) + return MTRR_TYPE_INVALID; + + /* + * Look up the fixed ranges first, which take priority over + * the variable ranges. + */ + if ((start < 0x100000) && + (mtrr_state.have_fixed) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) + return mtrr_type_lookup_fixed(start, end); + + /* + * Look up the variable ranges. Look of multiple ranges matching + * this address and pick type as per MTRR precedence. + */ + type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); /* * Common path is with repeat = 0. * However, we can have cases where [start:end] spans across some - * MTRR range. Do repeated lookups for that case here. + * MTRR ranges and/or the default type. Do repeated lookups for + * that case here. */ while (repeat) { prev_type = type; start = partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); if (check_type_overlap(&prev_type, &type)) return type; } + if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) + return MTRR_TYPE_WRBACK; + return type; } -- cgit v1.2.1 From b73522e0c1be58d3c69b124985b8ccf94e3677f7 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:10 +0200 Subject: x86/mm/mtrr: Enhance MTRR checks in kernel mapping helpers This patch adds the argument 'uniform' to mtrr_type_lookup(), which gets set to 1 when a given range is covered uniformly by MTRRs, i.e. the range is fully covered by a single MTRR entry or the default type. Change pud_set_huge() and pmd_set_huge() to honor the 'uniform' flag to see if it is safe to create a huge page mapping in the range. This allows them to create a huge page mapping in a range covered by a single MTRR entry of any memory type. It also detects a non-optimal request properly. They continue to check with the WB type since it does not effectively change the uniform mapping even if a request spans multiple MTRR entries. pmd_set_huge() logs a warning message to a non-optimal request so that driver writers will be aware of such a case. Drivers should make a mapping request aligned to a single MTRR entry when the range is covered by MTRRs. Signed-off-by: Toshi Kani [ Realign, flesh out comments, improve warning message. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-7-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mtrr.h | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 40 ++++++++++++++++++++++++++++---------- arch/x86/mm/pat.c | 4 ++-- arch/x86/mm/pgtable.c | 38 +++++++++++++++++++++++------------- 4 files changed, 58 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index bb03a547c1ab..a31759e1edd9 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -31,7 +31,7 @@ * arch_phys_wc_add and arch_phys_wc_del. */ # ifdef CONFIG_MTRR -extern u8 mtrr_type_lookup(u64 addr, u64 end); +extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); extern int mtrr_add(unsigned long base, unsigned long size, @@ -50,7 +50,7 @@ extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); extern int phys_wc_to_mtrr_index(int handle); # else -static inline u8 mtrr_type_lookup(u64 addr, u64 end) +static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { /* * Return no-MTRRs: diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e51100c49eea..f782d9b62cb3 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -147,19 +147,24 @@ static u8 mtrr_type_lookup_fixed(u64 start, u64 end) * Return Value: * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) * - * Output Argument: + * Output Arguments: * repeat - Set to 1 when [start:end] spanned across MTRR range and type * returned corresponds only to [start:*partial_end]. Caller has * to lookup again for [*partial_end:end]. + * + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. */ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, - int *repeat) + int *repeat, u8 *uniform) { int i; u64 base, mask; u8 prev_match, curr_match; *repeat = 0; + *uniform = 1; /* Make end inclusive instead of exclusive */ end--; @@ -214,6 +219,7 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, end = *partial_end - 1; /* end is inclusive */ *repeat = 1; + *uniform = 0; } if ((start & mask) != (base & mask)) @@ -225,6 +231,7 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, continue; } + *uniform = 0; if (check_type_overlap(&prev_match, &curr_match)) return curr_match; } @@ -241,10 +248,15 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, * Return Values: * MTRR_TYPE_(type) - The effective MTRR type for the region * MTRR_TYPE_INVALID - MTRR is disabled + * + * Output Argument: + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. */ -u8 mtrr_type_lookup(u64 start, u64 end) +u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { - u8 type, prev_type; + u8 type, prev_type, is_uniform = 1, dummy; int repeat; u64 partial_end; @@ -260,14 +272,18 @@ u8 mtrr_type_lookup(u64 start, u64 end) */ if ((start < 0x100000) && (mtrr_state.have_fixed) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) - return mtrr_type_lookup_fixed(start, end); + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { + is_uniform = 0; + type = mtrr_type_lookup_fixed(start, end); + goto out; + } /* * Look up the variable ranges. Look of multiple ranges matching * this address and pick type as per MTRR precedence. */ - type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &is_uniform); /* * Common path is with repeat = 0. @@ -278,15 +294,19 @@ u8 mtrr_type_lookup(u64 start, u64 end) while (repeat) { prev_type = type; start = partial_end; - type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); + is_uniform = 0; + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &dummy); if (check_type_overlap(&prev_type, &type)) - return type; + goto out; } if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) - return MTRR_TYPE_WRBACK; + type = MTRR_TYPE_WRBACK; +out: + *uniform = is_uniform; return type; } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 35af6771a95a..372ad422c2c3 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -267,9 +267,9 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, * request is for WB. */ if (req_type == _PAGE_CACHE_MODE_WB) { - u8 mtrr_type; + u8 mtrr_type, uniform; - mtrr_type = mtrr_type_lookup(start, end); + mtrr_type = mtrr_type_lookup(start, end, &uniform); if (mtrr_type != MTRR_TYPE_WRBACK) return _PAGE_CACHE_MODE_UC_MINUS; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c30f9819786b..fb0a9dd1d6e4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -566,19 +566,28 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, /** * pud_set_huge - setup kernel PUD mapping * - * MTRR can override PAT memory types with 4KiB granularity. Therefore, - * this function does not set up a huge page when the range is covered - * by a non-WB type of MTRR. MTRR_TYPE_INVALID indicates that MTRR are - * disabled. + * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this + * function sets up a huge page only if any of the following conditions are met: + * + * - MTRRs are disabled, or + * + * - MTRRs are enabled and the range is completely covered by a single MTRR, or + * + * - MTRRs are enabled and the corresponding MTRR memory type is WB, which + * has no effect on the requested PAT memory type. + * + * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger + * page mapping attempt fails. * * Returns 1 on success and 0 on failure. */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != MTRR_TYPE_INVALID)) + mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) return 0; prot = pgprot_4k_2_large(prot); @@ -593,20 +602,21 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) /** * pmd_set_huge - setup kernel PMD mapping * - * MTRR can override PAT memory types with 4KiB granularity. Therefore, - * this function does not set up a huge page when the range is covered - * by a non-WB type of MTRR. MTRR_TYPE_INVALID indicates that MTRR are - * disabled. + * See text over pud_set_huge() above. * * Returns 1 on success and 0 on failure. */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != MTRR_TYPE_INVALID)) + mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) { + pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", + __func__, addr, addr + PMD_SIZE); return 0; + } prot = pgprot_4k_2_large(prot); -- cgit v1.2.1 From 9e76561f6a8a1a1c4f3152a3fb403ef9d6cfc2ff Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:11 +0200 Subject: x86/mm/pat: Convert to pr_*() usage Use pr_info() instead of the old printk to prefix the component where things are coming from. With this readers will know exactly where the message is coming from. We use pr_* helpers but define pr_fmt to the empty string for easier grepping for those error messages. We leave the users of dprintk() in place, this will print only when the debugpat kernel parameter is enabled. We want to leave those enabled as a debug feature, but also make them use the same prefix. Signed-off-by: Luis R. Rodriguez [ Kill pr_fmt. ] Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Andy Walls Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Vetter Cc: Dave Airlie Cc: Denys Vlasenko Cc: Doug Ledford Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Michael S. Tsirkin Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cocci@systeme.lip6.fr Cc: plagnioj@jcrosoft.com Cc: tomi.valkeinen@ti.com Link: http://lkml.kernel.org/r/1430425520-22275-2-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 44 ++++++++++++++++++++++---------------------- arch/x86/mm/pat_internal.h | 2 +- arch/x86/mm/pat_rbtree.c | 6 +++--- 3 files changed, 26 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 372ad422c2c3..8c50b9bfa996 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -33,13 +33,16 @@ #include "pat_internal.h" #include "mm_internal.h" +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; static inline void pat_disable(const char *reason) { pat_enabled = 0; - printk(KERN_INFO "%s\n", reason); + pr_info("x86/PAT: %s\n", reason); } static int __init nopat(char *str) @@ -188,7 +191,7 @@ void pat_init_cache_modes(void) pat_msg + 4 * i); update_cache_mode_entry(i, cache); } - pr_info("PAT configuration [0-7]: %s\n", pat_msg); + pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); } #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) @@ -211,8 +214,7 @@ void pat_init(void) * switched to PAT on the boot CPU. We have no way to * undo PAT. */ - printk(KERN_ERR "PAT enabled, " - "but not supported by secondary CPU\n"); + pr_err("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); BUG(); } } @@ -347,7 +349,7 @@ static int reserve_ram_pages_type(u64 start, u64 end, page = pfn_to_page(pfn); type = get_page_memtype(page); if (type != -1) { - pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", + pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", start, end - 1, type, req_type); if (new_type) *new_type = type; @@ -451,9 +453,9 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, err = rbt_memtype_check_insert(new, new_type); if (err) { - printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", - start, end - 1, - cattr_name(new->type), cattr_name(req_type)); + pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); @@ -497,8 +499,8 @@ int free_memtype(u64 start, u64 end) spin_unlock(&memtype_lock); if (!entry) { - printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", - current->comm, current->pid, start, end - 1); + pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); return -EINVAL; } @@ -628,8 +630,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", - current->comm, from, to - 1); + pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; @@ -698,8 +700,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { - printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " - "for [mem %#010Lx-%#010Lx]\n", + pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(pcm), base, (unsigned long long)(base + size-1)); @@ -734,7 +735,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, pcm = lookup_memtype(paddr); if (want_pcm != pcm) { - printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", + pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_pcm), (unsigned long long)paddr, @@ -755,13 +756,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, if (strict_prot || !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { free_memtype(paddr, paddr + size); - printk(KERN_ERR "%s:%d map pfn expected mapping type %s" - " for [mem %#010Lx-%#010Lx], got %s\n", - current->comm, current->pid, - cattr_name(want_pcm), - (unsigned long long)paddr, - (unsigned long long)(paddr + size - 1), - cattr_name(pcm)); + pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n", + current->comm, current->pid, + cattr_name(want_pcm), + (unsigned long long)paddr, + (unsigned long long)(paddr + size - 1), + cattr_name(pcm)); return -EINVAL; } /* diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h index f6411620305d..a739bfc40690 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat_internal.h @@ -4,7 +4,7 @@ extern int pat_debug_enable; #define dprintk(fmt, arg...) \ - do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) + do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0) struct memtype { u64 start; diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 6582adcc8bd9..63931080366a 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -160,9 +160,9 @@ success: return 0; failure: - printk(KERN_INFO "%s:%d conflicting memory types " - "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start, - end, cattr_name(found_type), cattr_name(match->type)); + pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, start, end, + cattr_name(found_type), cattr_name(match->type)); return -EBUSY; } -- cgit v1.2.1 From 2f9e897353fcb99effd6eff22f7b464f8e2a659a Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:12 +0200 Subject: x86/mm/mtrr, pat: Document Write Combining MTRR type effects on PAT / non-PAT pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of the effort to phase out MTRR use document write-combining MTRR effects on pages with different non-PAT page attributes flags and different PAT entry values. Extend arch_phys_wc_add() documentation to clarify power of two sizes / boundary requirements as we phase out mtrr_add() use. Lastly hint towards ioremap_uc() for corner cases on device drivers working with devices with mixed regions where MTRR size requirements would otherwise not enable write-combining effective memory types. Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Antonino Daplas Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Vetter Cc: Dave Airlie Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jean-Christophe Plagniol-Villard Cc: Jonathan Corbet Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Suresh Siddha Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: Ville Syrjälä Cc: Vlastimil Babka Cc: linux-fbdev@vger.kernel.org Link: http://lkml.kernel.org/r/1430343851-967-3-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index ea5f363a1948..04aceb7e6443 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -538,6 +538,9 @@ EXPORT_SYMBOL(mtrr_del); * attempts to add a WC MTRR covering size bytes starting at base and * logs an error if this fails. * + * The called should provide a power of two size on an equivalent + * power of two boundary. + * * Drivers must store the return value to pass to mtrr_del_wc_if_needed, * but drivers should not try to interpret that return value. */ -- cgit v1.2.1 From 7d010fdf299929f9583ce5e17da629dcd83c36ef Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:13 +0200 Subject: x86/mm/mtrr: Avoid #ifdeffery with phys_wc_to_mtrr_index() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is only one user but since we're going to bury MTRR next out of access to drivers, expose this last piece of API to drivers in a general fashion only needing io.h for access to helpers. Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Abhilash Kesavan Cc: Andrew Morton Cc: Andy Lutomirski Cc: Antonino Daplas Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: Cristian Stoica Cc: Daniel Vetter Cc: Dave Airlie Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Jean-Christophe Plagniol-Villard Cc: Juergen Gross Cc: Linus Torvalds Cc: Matthias Brugger Cc: Mel Gorman Cc: Peter Zijlstra Cc: Suresh Siddha Cc: Thierry Reding Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: Toshi Kani Cc: Ville Syrjälä Cc: Vlastimil Babka Cc: Will Deacon Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1429722736-4473-1-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 3 +++ arch/x86/include/asm/mtrr.h | 5 ----- arch/x86/kernel/cpu/mtrr/main.c | 6 +++--- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 4afc05ffa566..a2b97404922d 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -339,6 +339,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff #ifdef CONFIG_MTRR +extern int __must_check arch_phys_wc_index(int handle); +#define arch_phys_wc_index arch_phys_wc_index + extern int __must_check arch_phys_wc_add(unsigned long base, unsigned long size); extern void arch_phys_wc_del(int handle); diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index a31759e1edd9..b94f6f64e23d 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -48,7 +48,6 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); -extern int phys_wc_to_mtrr_index(int handle); # else static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { @@ -84,10 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } -static inline int phys_wc_to_mtrr_index(int handle) -{ - return -1; -} #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 04aceb7e6443..81baf5fee0e1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -580,7 +580,7 @@ void arch_phys_wc_del(int handle) EXPORT_SYMBOL(arch_phys_wc_del); /* - * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value + * arch_phys_wc_index - translates arch_phys_wc_add's return value * @handle: Return value from arch_phys_wc_add * * This will turn the return value from arch_phys_wc_add into an mtrr @@ -590,14 +590,14 @@ EXPORT_SYMBOL(arch_phys_wc_del); * in printk line. Alas there is an illegitimate use in some ancient * drm ioctls. */ -int phys_wc_to_mtrr_index(int handle) +int arch_phys_wc_index(int handle) { if (handle < MTRR_TO_PHYS_WC_OFFSET) return -1; else return handle - MTRR_TO_PHYS_WC_OFFSET; } -EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index); +EXPORT_SYMBOL_GPL(arch_phys_wc_index); /* * HACK ALERT! -- cgit v1.2.1 From f9626104a5b6815ec7d65789dfb900af5fa51e64 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:14 +0200 Subject: x86/mm/mtrr: Generalize runtime disabling of MTRRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible to enable CONFIG_MTRR and CONFIG_X86_PAT and end up with a system with MTRR functionality disabled but PAT functionality enabled. This can happen, for instance, when the Xen hypervisor is used where MTRRs are not supported but PAT is. This can happen on Linux as of commit 47591df50512 ("xen: Support Xen pv-domains using PAT") by Juergen, introduced in v3.19. Technically, we should assume the proper CPU bits would be set to disable MTRRs but we can't always rely on this. At least on the Xen Hypervisor, for instance, only X86_FEATURE_MTRR was disabled as of Xen 4.4 through Xen commit 586ab6a [0], but not X86_FEATURE_K6_MTRR, X86_FEATURE_CENTAUR_MCR, or X86_FEATURE_CYRIX_ARR for instance. Roger Pau Monné has clarified though that although this is technically true we will never support PVH on these CPU types so Xen has no need to disable these bits on those systems. As per Roger, AMD K6, Centaur and VIA chips don't have the necessary hardware extensions to allow running PVH guests [1]. As per Toshi it is also possible for the BIOS to disable MTRR support, in such cases get_mtrr_state() would update the MTRR state as per the BIOS, we need to propagate this information as well. x86 MTRR code relies on quite a bit of checks for mtrr_if being set to check to see if MTRRs did get set up. Instead, lets provide a generic getter for that. This also adds a few checks where they were not before which could potentially safeguard ourselves against incorrect usage of MTRR where this was not desirable. Where possible match error codes as if MTRRs were disabled on arch/x86/include/asm/mtrr.h. Lastly, since disabling MTRRs can happen at run time and we could end up with PAT enabled, best record now in our logs when MTRRs are disabled. [0] ~/devel/xen (git::stable-4.5)$ git describe --contains 586ab6a 4.4.0-rc1~18 [1] http://lists.xenproject.org/archives/html/xen-devel/2015-03/msg03460.html Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Antonino Daplas Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Vetter Cc: Dave Airlie Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jean-Christophe Plagniol-Villard Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Roger Pau Monné Cc: Stefan Bader Cc: Suresh Siddha Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: Toshi Kani Cc: Ville Syrjälä Cc: Vlastimil Babka Cc: bhelgaas@google.com Cc: david.vrabel@citrix.com Cc: jbeulich@suse.com Cc: konrad.wilk@oracle.com Cc: venkatesh.pallipadi@intel.com Cc: ville.syrjala@linux.intel.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1426893517-2511-3-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-12-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 4 +++- arch/x86/kernel/cpu/mtrr/main.c | 39 ++++++++++++++++++++++++++++++-------- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 +- 3 files changed, 35 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index f782d9b62cb3..3b533cf37c74 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -445,7 +445,7 @@ static void __init print_mtrr_state(void) } /* Grab all of the MTRR state for this CPU into *state */ -void __init get_mtrr_state(void) +bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; unsigned long flags; @@ -489,6 +489,8 @@ void __init get_mtrr_state(void) post_set(); local_irq_restore(flags); + + return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } /* Some BIOS's are messed up and don't set all MTRRs the same! */ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 81baf5fee0e1..383efb26e516 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -59,6 +59,12 @@ #define MTRR_TO_PHYS_WC_OFFSET 1000 u32 num_var_ranges; +static bool __mtrr_enabled; + +static bool mtrr_enabled(void) +{ + return __mtrr_enabled; +} unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); @@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, int i, replace, error; mtrr_type ltype; - if (!mtrr_if) + if (!mtrr_enabled()) return -ENXIO; error = mtrr_if->validate_add_page(base, size, type); @@ -435,6 +441,8 @@ static int mtrr_check(unsigned long base, unsigned long size) int mtrr_add(unsigned long base, unsigned long size, unsigned int type, bool increment) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, @@ -463,8 +471,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) unsigned long lbase, lsize; int error = -EINVAL; - if (!mtrr_if) - return -ENXIO; + if (!mtrr_enabled()) + return -ENODEV; max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ @@ -523,6 +531,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) */ int mtrr_del(int reg, unsigned long base, unsigned long size) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); @@ -548,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size) { int ret; - if (pat_enabled) + if (pat_enabled || !mtrr_enabled()) return 0; /* Success! (We don't need to do anything.) */ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); @@ -737,10 +747,12 @@ void __init mtrr_bp_init(void) } if (mtrr_if) { + __mtrr_enabled = true; set_num_var_ranges(); init_table(); if (use_intel()) { - get_mtrr_state(); + /* BIOS may override */ + __mtrr_enabled = get_mtrr_state(); if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; @@ -748,10 +760,16 @@ void __init mtrr_bp_init(void) } } } + + if (!mtrr_enabled()) + pr_info("MTRR: Disabled\n"); } void mtrr_ap_init(void) { + if (!mtrr_enabled()) + return; + if (!use_intel() || mtrr_aps_delayed_init) return; /* @@ -777,6 +795,9 @@ void mtrr_save_state(void) { int first_cpu; + if (!mtrr_enabled()) + return; + get_online_cpus(); first_cpu = cpumask_first(cpu_online_mask); smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); @@ -785,6 +806,8 @@ void mtrr_save_state(void) void set_mtrr_aps_delayed_init(void) { + if (!mtrr_enabled()) + return; if (!use_intel()) return; @@ -796,7 +819,7 @@ void set_mtrr_aps_delayed_init(void) */ void mtrr_aps_init(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; /* @@ -813,7 +836,7 @@ void mtrr_aps_init(void) void mtrr_bp_restore(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; mtrr_if->set_all(); @@ -821,7 +844,7 @@ void mtrr_bp_restore(void) static int __init mtrr_init_finialize(void) { - if (!mtrr_if) + if (!mtrr_enabled()) return 0; if (use_intel()) { diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index df5e41f31a27..951884dcc433 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); -void get_mtrr_state(void); +bool get_mtrr_state(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); -- cgit v1.2.1 From cb32edf65bf2197a2d2226e94c7602dc92e295bb Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:15 +0200 Subject: x86/mm/pat: Wrap pat_enabled into a function API We use pat_enabled in x86-specific code to see if PAT is enabled or not but we're granting full access to it even though readers do not need to set it. If, for instance, we granted access to it to modules later they then could override the variable setting... no bueno. This renames pat_enabled to a new static variable __pat_enabled. Folks are redirected to use pat_enabled() now. Code that sets this can only be internal to pat.c. Apart from the early kernel parameter "nopat" to disable PAT, we also have a few cases that disable it later and make use of a helper pat_disable(). It is wrapped under an ifdef but since that code cannot run unless PAT was enabled its not required to wrap it with ifdefs, unwrap that. Likewise, since "nopat" doesn't really change non-PAT systems just remove that ifdef as well. Although we could add and use an early_param_off(), these helpers don't use __read_mostly but we want to keep __read_mostly for __pat_enabled as this is a hot path -- upon boot, for instance, a simple guest may see ~4k accesses to pat_enabled(). Since __read_mostly early boot params are not that common we don't add a helper for them just yet. Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Andy Walls Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Christoph Lameter Cc: Daniel Vetter Cc: Dave Airlie Cc: Denys Vlasenko Cc: Doug Ledford Cc: H. Peter Anvin Cc: Juergen Gross Cc: Kyle McMartin Cc: Linus Torvalds Cc: Michael S. Tsirkin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1430425520-22275-3-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pat.h | 7 +------ arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/mm/iomap_32.c | 2 +- arch/x86/mm/ioremap.c | 4 ++-- arch/x86/mm/pageattr.c | 2 +- arch/x86/mm/pat.c | 33 +++++++++++++++------------------ arch/x86/pci/i386.c | 6 +++--- 7 files changed, 24 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 91bc4ba95f91..cdcff7f7f694 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -4,12 +4,7 @@ #include #include -#ifdef CONFIG_X86_PAT -extern int pat_enabled; -#else -static const int pat_enabled; -#endif - +bool pat_enabled(void); extern void pat_init(void); void pat_init_cache_modes(void); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 383efb26e516..e7ed0d8ebacb 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -558,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size) { int ret; - if (pat_enabled || !mtrr_enabled()) + if (pat_enabled() || !mtrr_enabled()) return 0; /* Success! (We don't need to do anything.) */ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 9ca35fc60cfe..3a2ec8790ca7 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -82,7 +82,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) * MTRR is UC or WC. UC_MINUS gets the real intention, of the * user, which is "WC if the MTRR is WC, UC if you can't do that." */ - if (!pat_enabled && pgprot_val(prot) == + if (!pat_enabled() && pgprot_val(prot) == (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC))) prot = __pgprot(__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index a493bb83aa89..82d63ed70045 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -234,7 +234,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { /* * Ideally, this should be: - * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; + * pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; * * Till we fix all X drivers to use ioremap_wc(), we will use * UC MINUS. Drivers that are certain they need or can already @@ -292,7 +292,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc); */ void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { - if (pat_enabled) + if (pat_enabled()) return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, __builtin_return_address(0)); else diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 397838eb292b..70d221fe2eb4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1571,7 +1571,7 @@ int set_memory_wc(unsigned long addr, int numpages) { int ret; - if (!pat_enabled) + if (!pat_enabled()) return set_memory_uc(addr, numpages); ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 8c50b9bfa996..484dce7f759b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -36,12 +36,11 @@ #undef pr_fmt #define pr_fmt(fmt) "" fmt -#ifdef CONFIG_X86_PAT -int __read_mostly pat_enabled = 1; +static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); static inline void pat_disable(const char *reason) { - pat_enabled = 0; + __pat_enabled = 0; pr_info("x86/PAT: %s\n", reason); } @@ -51,13 +50,11 @@ static int __init nopat(char *str) return 0; } early_param("nopat", nopat); -#else -static inline void pat_disable(const char *reason) + +bool pat_enabled(void) { - (void)reason; + return !!__pat_enabled; } -#endif - int pat_debug_enable; @@ -201,7 +198,7 @@ void pat_init(void) u64 pat; bool boot_cpu = !boot_pat_state; - if (!pat_enabled) + if (!pat_enabled()) return; if (!cpu_has_pat) { @@ -402,7 +399,7 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, BUG_ON(start >= end); /* end is exclusive */ - if (!pat_enabled) { + if (!pat_enabled()) { /* This is identical to page table setting without PAT */ if (new_type) { if (req_type == _PAGE_CACHE_MODE_WC) @@ -477,7 +474,7 @@ int free_memtype(u64 start, u64 end) int is_range_ram; struct memtype *entry; - if (!pat_enabled) + if (!pat_enabled()) return 0; /* Low ISA region is always mapped WB. No need to track */ @@ -625,7 +622,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) u64 to = from + size; u64 cursor = from; - if (!pat_enabled) + if (!pat_enabled()) return 1; while (cursor < to) { @@ -661,7 +658,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, * caching for the high addresses through the KEN pin, but * we maintain the tradition of paranoia in this code. */ - if (!pat_enabled && + if (!pat_enabled() && !(boot_cpu_has(X86_FEATURE_MTRR) || boot_cpu_has(X86_FEATURE_K6_MTRR) || boot_cpu_has(X86_FEATURE_CYRIX_ARR) || @@ -730,7 +727,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, * the type requested matches the type of first page in the range. */ if (is_ram) { - if (!pat_enabled) + if (!pat_enabled()) return 0; pcm = lookup_memtype(paddr); @@ -844,7 +841,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return ret; } - if (!pat_enabled) + if (!pat_enabled()) return 0; /* @@ -872,7 +869,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, { enum page_cache_mode pcm; - if (!pat_enabled) + if (!pat_enabled()) return 0; /* Set prot based on lookup */ @@ -913,7 +910,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, pgprot_t pgprot_writecombine(pgprot_t prot) { - if (pat_enabled) + if (pat_enabled()) return __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); else @@ -996,7 +993,7 @@ static const struct file_operations memtype_fops = { static int __init pat_memtype_list_init(void) { - if (pat_enabled) { + if (pat_enabled()) { debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, NULL, &memtype_fops); } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 349c0d32cc0b..0a9f2caf358f 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -429,12 +429,12 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, * Caller can followup with UC MINUS request and add a WC mtrr if there * is a free mtrr slot. */ - if (!pat_enabled && write_combine) + if (!pat_enabled() && write_combine) return -EINVAL; - if (pat_enabled && write_combine) + if (pat_enabled() && write_combine) prot |= cachemode2protval(_PAGE_CACHE_MODE_WC); - else if (pat_enabled || boot_cpu_data.x86 > 3) + else if (pat_enabled() || boot_cpu_data.x86 > 3) /* * ioremap() and ioremap_nocache() defaults to UC MINUS for now. * To avoid attribute conflicts, request UC MINUS here -- cgit v1.2.1 From fbe7193aa4787f27c84216d130ab877efc310d57 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:16 +0200 Subject: x86/mm/pat: Export pat_enabled() Two Linux device drivers cannot work with PAT and the work required to make them work is significant. There is not enough motivation to convert these drivers over to use PAT properly, the compromise reached is to let drivers that cannot be ported to PAT check if PAT was enabled and if so fail on probe with a recommendation to boot with the "nopat" kernel parameter. Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Andy Walls Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Vetter Cc: Dave Airlie Cc: Denys Vlasenko Cc: Doug Ledford Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Michael S. Tsirkin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1430425520-22275-4-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-14-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 484dce7f759b..a1c96544099d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -55,6 +55,7 @@ bool pat_enabled(void) { return !!__pat_enabled; } +EXPORT_SYMBOL_GPL(pat_enabled); int pat_debug_enable; -- cgit v1.2.1 From 06931e62246844c73fba24d7aeb4a5dc897a2739 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 May 2015 15:11:28 +0200 Subject: sched/topology: Rename topology_thread_cpumask() to topology_sibling_cpumask() Rename topology_thread_cpumask() to topology_sibling_cpumask() for more consistency with scheduler code. Signed-off-by: Bartosz Golaszewski Reviewed-by: Thomas Gleixner Acked-by: Russell King Acked-by: Catalin Marinas Cc: Benoit Cousson Cc: Fenghua Yu Cc: Guenter Roeck Cc: Jean Delvare Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Oleg Drokin Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Russell King Cc: Viresh Kumar Link: http://lkml.kernel.org/r/1432645896-12588-2-git-send-email-bgolaszewski@baylibre.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/topology.h | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 0e8f04f2c26f..5a77593fdace 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -124,7 +124,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #ifdef ENABLE_TOPO_DEFINES #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) -#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) +#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3998131d1a68..324817735771 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2621,7 +2621,7 @@ static void intel_pmu_cpu_starting(int cpu) if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; - for_each_cpu(i, topology_thread_cpumask(cpu)) { + for_each_cpu(i, topology_sibling_cpumask(cpu)) { struct intel_shared_regs *pc; pc = per_cpu(cpu_hw_events, i).shared_regs; @@ -2641,7 +2641,7 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { int h = x86_pmu.num_counters >> 1; - for_each_cpu(i, topology_thread_cpumask(cpu)) { + for_each_cpu(i, topology_sibling_cpumask(cpu)) { struct intel_excl_cntrs *c; c = per_cpu(cpu_hw_events, i).excl_cntrs; @@ -3403,7 +3403,7 @@ static __init int fixup_ht_bug(void) if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) return 0; - w = cpumask_weight(topology_thread_cpumask(cpu)); + w = cpumask_weight(topology_sibling_cpumask(cpu)); if (w > 1) { pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); return 0; -- cgit v1.2.1 From 7d79a7bd7554d420313451fb805ebc37a8da97fe Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 May 2015 15:11:35 +0200 Subject: x86: Replace cpu_**_mask() with topology_**_cpumask() The former duplicate the functionalities of the latter but are neither documented nor arch-independent. Signed-off-by: Bartosz Golaszewski Reviewed-by: Thomas Gleixner Cc: Benoit Cousson Cc: Catalin Marinas Cc: Fenghua Yu Cc: Guenter Roeck Cc: Jean Delvare Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Oleg Drokin Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Russell King Cc: Viresh Kumar Link: http://lkml.kernel.org/r/1432645896-12588-9-git-send-email-bgolaszewski@baylibre.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/proc.c | 3 ++- arch/x86/kernel/smpboot.c | 42 ++++++++++++++++++++++-------------------- arch/x86/kernel/tsc_sync.c | 2 +- 3 files changed, 25 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index e7d8c7608471..18ca99f2798b 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -12,7 +12,8 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, { #ifdef CONFIG_SMP seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); - seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu))); + seq_printf(m, "siblings\t: %d\n", + cpumask_weight(topology_core_cpumask(cpu))); seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); seq_printf(m, "apicid\t\t: %d\n", c->apicid); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 50e547eac8cd..0e8209619455 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -314,10 +314,10 @@ topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); } -#define link_mask(_m, c1, c2) \ +#define link_mask(mfunc, c1, c2) \ do { \ - cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \ - cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ + cpumask_set_cpu((c1), mfunc(c2)); \ + cpumask_set_cpu((c2), mfunc(c1)); \ } while (0) static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) @@ -398,9 +398,9 @@ void set_cpu_sibling_map(int cpu) cpumask_set_cpu(cpu, cpu_sibling_setup_mask); if (!has_mp) { - cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); - cpumask_set_cpu(cpu, cpu_core_mask(cpu)); + cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); c->booted_cores = 1; return; } @@ -409,32 +409,34 @@ void set_cpu_sibling_map(int cpu) o = &cpu_data(i); if ((i == cpu) || (has_smt && match_smt(c, o))) - link_mask(sibling, cpu, i); + link_mask(topology_sibling_cpumask, cpu, i); if ((i == cpu) || (has_mp && match_llc(c, o))) - link_mask(llc_shared, cpu, i); + link_mask(cpu_llc_shared_mask, cpu, i); } /* * This needs a separate iteration over the cpus because we rely on all - * cpu_sibling_mask links to be set-up. + * topology_sibling_cpumask links to be set-up. */ for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); if ((i == cpu) || (has_mp && match_die(c, o))) { - link_mask(core, cpu, i); + link_mask(topology_core_cpumask, cpu, i); /* * Does this new cpu bringup a new core? */ - if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) { + if (cpumask_weight( + topology_sibling_cpumask(cpu)) == 1) { /* * for each core in package, increment * the booted_cores for this new cpu */ - if (cpumask_first(cpu_sibling_mask(i)) == i) + if (cpumask_first( + topology_sibling_cpumask(i)) == i) c->booted_cores++; /* * increment the core count for all @@ -1009,8 +1011,8 @@ static __init void disable_smp(void) physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); else physid_set_mask_of_physid(0, &phys_cpu_present_map); - cpumask_set_cpu(0, cpu_sibling_mask(0)); - cpumask_set_cpu(0, cpu_core_mask(0)); + cpumask_set_cpu(0, topology_sibling_cpumask(0)); + cpumask_set_cpu(0, topology_core_cpumask(0)); } enum { @@ -1293,22 +1295,22 @@ static void remove_siblinginfo(int cpu) int sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); - for_each_cpu(sibling, cpu_core_mask(cpu)) { - cpumask_clear_cpu(cpu, cpu_core_mask(sibling)); + for_each_cpu(sibling, topology_core_cpumask(cpu)) { + cpumask_clear_cpu(cpu, topology_core_cpumask(sibling)); /*/ * last thread sibling in this cpu core going down */ - if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) + if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1) cpu_data(sibling).booted_cores--; } - for_each_cpu(sibling, cpu_sibling_mask(cpu)) - cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling)); + for_each_cpu(sibling, topology_sibling_cpumask(cpu)) + cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); - cpumask_clear(cpu_sibling_mask(cpu)); - cpumask_clear(cpu_core_mask(cpu)); + cpumask_clear(topology_sibling_cpumask(cpu)); + cpumask_clear(topology_core_cpumask(cpu)); c->phys_proc_id = 0; c->cpu_core_id = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 26488487bc61..dd8d0791dfb5 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -113,7 +113,7 @@ static void check_tsc_warp(unsigned int timeout) */ static inline unsigned int loop_timeout(int cpu) { - return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20; + return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20; } /* -- cgit v1.2.1 From 960d447b94b22ceba286917056871d1dac8da697 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 May 2015 15:11:36 +0200 Subject: x86: Remove cpu_sibling_mask() and cpu_core_mask() These functions are arch-specific and duplicate the functionality of macros defined in linux/include/topology.h. Remove them as all the callers in x86 have now switched to using the topology_**_cpumask() family. Signed-off-by: Bartosz Golaszewski Reviewed-by: Thomas Gleixner Cc: Benoit Cousson Cc: Catalin Marinas Cc: Fenghua Yu Cc: Guenter Roeck Cc: Jean Delvare Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Oleg Drokin Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Russell King Cc: Viresh Kumar Link: http://lkml.kernel.org/r/1432645896-12588-10-git-send-email-bgolaszewski@baylibre.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/smp.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 17a8dced12da..222a6a3ca2b5 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -37,16 +37,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); -static inline struct cpumask *cpu_sibling_mask(int cpu) -{ - return per_cpu(cpu_sibling_map, cpu); -} - -static inline struct cpumask *cpu_core_mask(int cpu) -{ - return per_cpu(cpu_core_map, cpu); -} - static inline struct cpumask *cpu_llc_shared_mask(int cpu) { return per_cpu(cpu_llc_shared_map, cpu); -- cgit v1.2.1 From e90328b87eeff91574c18815bd6991b03bd6ecc0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Apr 2015 18:16:02 -0700 Subject: mce: Stop using array-index-based RCU primitives Because mce is arch-specific x86 code, there is little or no performance benefit of using rcu_dereference_index_check() over using smp_load_acquire(). It also turns out that mce is the only place that array-index-based RCU is used, and it would be convenient to drop this portion of the RCU API. This patch therefore changes rcu_dereference_index_check() uses to smp_load_acquire(), but keeping the lockdep diagnostics, and also changes rcu_access_index() uses to READ_ONCE(). Signed-off-by: Paul E. McKenney Cc: linux-edac@vger.kernel.org Cc: Tony Luck Acked-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e535533d5ab8..1fd04a02d90a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -53,9 +53,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define rcu_dereference_check_mce(p) \ - rcu_dereference_index_check((p), \ - rcu_read_lock_sched_held() || \ - lockdep_is_held(&mce_chrdev_read_mutex)) +({ \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious rcu_dereference_check_mce() usage"); \ + smp_load_acquire(&(p)); \ +}) #define CREATE_TRACE_POINTS #include @@ -1884,7 +1887,7 @@ out: static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_chrdev_wait, wait); - if (rcu_access_index(mcelog.next)) + if (READ_ONCE(mcelog.next)) return POLLIN | POLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return POLLIN | POLLRDNORM; -- cgit v1.2.1 From 29c6820f5164b1432b1ba60b38175cb24367dd98 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Apr 2015 14:05:25 -0700 Subject: mce: mce_chrdev_write() can be static Signed-off-by: Fengguang Wu Signed-off-by: Paul E. McKenney --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1fd04a02d90a..d4298feaa6fb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1932,8 +1932,8 @@ void register_mce_write_callback(ssize_t (*fn)(struct file *filp, } EXPORT_SYMBOL_GPL(register_mce_write_callback); -ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) +static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) { if (mce_write) return mce_write(filp, ubuf, usize, off); -- cgit v1.2.1 From ad5fb870c486d932a1749d7853dd70f436a7e03f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 3 Apr 2015 12:05:28 -0400 Subject: e820, efi: add ACPI 6.0 persistent memory types ACPI 6.0 formalizes e820-type-7 and efi-type-14 as persistent memory. Mark it "reserved" and allow it to be claimed by a persistent memory device driver. This definition is in addition to the Linux kernel's existing type-12 definition that was recently added in support of shipping platforms with NVDIMM support that predate ACPI 6.0 (which now classifies type-12 as OEM reserved). Note, /proc/iomem can be consulted for differentiating legacy "Persistent Memory (legacy)" E820_PRAM vs standard "Persistent Memory" E820_PMEM. Cc: Boaz Harrosh Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jens Axboe Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Thomas Gleixner Acked-by: Jeff Moyer Acked-by: Andy Lutomirski Reviewed-by: Ross Zwisler Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- arch/x86/boot/compressed/eboot.c | 4 ++++ arch/x86/include/uapi/asm/e820.h | 1 + arch/x86/kernel/e820.c | 28 ++++++++++++++++++++++++---- arch/x86/platform/efi/efi.c | 3 +++ 4 files changed, 32 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 48304b89b601..2c82bd150d43 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1224,6 +1224,10 @@ static efi_status_t setup_e820(struct boot_params *params, e820_type = E820_NVS; break; + case EFI_PERSISTENT_MEMORY: + e820_type = E820_PMEM; + break; + default: continue; } diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 960a8a9dc4ab..0f457e6eab18 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -32,6 +32,7 @@ #define E820_ACPI 3 #define E820_NVS 4 #define E820_UNUSABLE 5 +#define E820_PMEM 7 /* * This is a non-standardized way to represent ADR or NVDIMM regions that diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e2ce85db2283..c857d53269dd 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -149,6 +149,7 @@ static void __init e820_print_type(u32 type) case E820_UNUSABLE: printk(KERN_CONT "unusable"); break; + case E820_PMEM: case E820_PRAM: printk(KERN_CONT "persistent (type %u)", type); break; @@ -918,11 +919,32 @@ static inline const char *e820_type_to_string(int e820_type) case E820_ACPI: return "ACPI Tables"; case E820_NVS: return "ACPI Non-volatile Storage"; case E820_UNUSABLE: return "Unusable memory"; - case E820_PRAM: return "Persistent RAM"; + case E820_PRAM: return "Persistent Memory (legacy)"; + case E820_PMEM: return "Persistent Memory"; default: return "reserved"; } } +static bool do_mark_busy(u32 type, struct resource *res) +{ + /* this is the legacy bios/dos rom-shadow + mmio region */ + if (res->start < (1ULL<<20)) + return true; + + /* + * Treat persistent memory like device memory, i.e. reserve it + * for exclusive use of a driver + */ + switch (type) { + case E820_RESERVED: + case E820_PRAM: + case E820_PMEM: + return false; + default: + return true; + } +} + /* * Mark e820 reserved areas as busy for the resource manager. */ @@ -952,9 +974,7 @@ void __init e820_reserve_resources(void) * pci device BAR resource and insert them later in * pcibios_resource_survey() */ - if (((e820.map[i].type != E820_RESERVED) && - (e820.map[i].type != E820_PRAM)) || - res->start < (1ULL<<20)) { + if (do_mark_busy(e820.map[i].type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 02744df576d5..fe01ae37a2a4 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -153,6 +153,9 @@ static void __init do_add_efi_memmap(void) case EFI_UNUSABLE_MEMORY: e820_type = E820_UNUSABLE; break; + case EFI_PERSISTENT_MEMORY: + e820_type = E820_PMEM; + break; default: /* * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE -- cgit v1.2.1 From 9c27847dda9cfae7c273cde62becf364f9fa9ea3 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 27 May 2015 11:09:38 +0930 Subject: kernel/params: constify struct kernel_param_ops uses Most code already uses consts for the struct kernel_param_ops, sweep the kernel for the last offending stragglers. Other than include/linux/moduleparam.h and kernel/params.c all other changes were generated with the following Coccinelle SmPL patch. Merge conflicts between trees can be handled with Coccinelle. In the future git could get Coccinelle merge support to deal with patch --> fail --> grammar --> Coccinelle --> new patch conflicts automatically for us on patches where the grammar is available and the patch is of high confidence. Consider this a feature request. Test compiled on x86_64 against: * allnoconfig * allmodconfig * allyesconfig @ const_found @ identifier ops; @@ const struct kernel_param_ops ops = { }; @ const_not_found depends on !const_found @ identifier ops; @@ -struct kernel_param_ops ops = { +const struct kernel_param_ops ops = { }; Generated-by: Coccinelle SmPL Cc: Rusty Russell Cc: Junio C Hamano Cc: Andrew Morton Cc: Kees Cook Cc: Tejun Heo Cc: Ingo Molnar Cc: cocci@systeme.lip6.fr Cc: linux-kernel@vger.kernel.org Signed-off-by: Luis R. Rodriguez Signed-off-by: Rusty Russell --- arch/x86/kvm/mmu_audit.c | 2 +- arch/x86/platform/uv/uv_nmi.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 9ade5cfb5a4c..87393e3ae087 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -291,7 +291,7 @@ static int mmu_audit_set(const char *val, const struct kernel_param *kp) return 0; } -static struct kernel_param_ops audit_param_ops = { +static const struct kernel_param_ops audit_param_ops = { .set = mmu_audit_set, .get = param_get_bool, }; diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 7488cafab955..020c101c255f 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -104,7 +104,7 @@ static int param_set_local64(const char *val, const struct kernel_param *kp) return 0; } -static struct kernel_param_ops param_ops_local64 = { +static const struct kernel_param_ops param_ops_local64 = { .get = param_get_local64, .set = param_set_local64, }; -- cgit v1.2.1 From f36f3f2846b5578d62910ee0b6dbef59fdd1cfa4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 May 2015 13:20:23 +0200 Subject: KVM: add "new" argument to kvm_arch_commit_memory_region This lets the function access the new memory slot without going through kvm_memslots and id_to_memslot. It will simplify the code when more than one address space will be supported. Unfortunately, the "const"ness of the new argument must be casted away in two places. Fixing KVM to accept const struct kvm_memory_slot pointers would require modifications in pretty much all architectures, and is left for later. Reviewed-by: Radim Krcmar Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 6 ++++-- arch/x86/kvm/x86.c | 13 +++++-------- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1a4d6a054749..7276107b35df 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -874,7 +874,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *memslot); + const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 49c34e632b91..1bf2ae9ca521 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4621,10 +4621,12 @@ restart: } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { + /* FIXME: const-ify all uses of struct kvm_memory_slot. */ spin_lock(&kvm->mmu_lock); - slot_handle_leaf(kvm, memslot, kvm_mmu_zap_collapsible_spte, true); + slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, + kvm_mmu_zap_collapsible_spte, true); spin_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f0aec85f8cdd..ba7b0cc52fed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7780,13 +7780,12 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { - struct kvm_memslots *slots; - struct kvm_memory_slot *new; int nr_mmu_pages = 0; - if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { + if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) { int ret; ret = vm_munmap(old->userspace_addr, @@ -7803,10 +7802,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - /* It's OK to get 'new' slot here as it has already been installed */ - slots = kvm_memslots(kvm); - new = id_to_memslot(slots, mem->slot); - /* * Dirty logging tracks sptes in 4k granularity, meaning that large * sptes have to be split. If live migration is successful, the guest @@ -7831,9 +7826,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * been zapped so no dirty logging staff is needed for old slot. For * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the * new and it's also covered when dealing with the new slot. + * + * FIXME: const-ify all uses of struct kvm_memory_slot. */ if (change != KVM_MR_DELETE) - kvm_mmu_slot_apply_flags(kvm, new); + kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); } void kvm_arch_flush_shadow_all(struct kvm *kvm) -- cgit v1.2.1 From d9ef13c2b3983de8dd1373ef670799dbb6498122 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 19 May 2015 16:01:50 +0200 Subject: KVM: pass kvm_memory_slot to gfn_to_page_many_atomic The memory slot is already available from gfn_to_memslot_dirty_bitmap. Isn't it a shame to look it up again? Plus, it makes gfn_to_page_many_atomic agnostic of multiple VCPU address spaces. Reviewed-by: Radim Krcmar Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1bf2ae9ca521..6a7e5b6246b1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2728,15 +2728,17 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, u64 *start, u64 *end) { struct page *pages[PTE_PREFETCH_NUM]; + struct kvm_memory_slot *slot; unsigned access = sp->role.access; int i, ret; gfn_t gfn; gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); - if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); + if (!slot) return -1; - ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); + ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); if (ret <= 0) return -1; -- cgit v1.2.1 From 3ed1a4787617f948631a77903453847142271867 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 19 May 2015 16:29:22 +0200 Subject: KVM: x86: pass struct kvm_mmu_page to account/unaccount_shadowed Prepare for multiple address spaces this way, since a VCPU is not available where unaccount_shadowed is called. We will get to the right kvm_memslots struct through the role field in struct kvm_mmu_page. Reviewed-by: Takuya Yoshikawa Reviewed-by: Radim Krcmar Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6a7e5b6246b1..deb8862cfd54 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -804,12 +804,14 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } -static void account_shadowed(struct kvm *kvm, gfn_t gfn) +static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; + gfn_t gfn; int i; + gfn = sp->gfn; slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); @@ -818,12 +820,14 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) kvm->arch.indirect_shadow_pages++; } -static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) +static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; + gfn_t gfn; int i; + gfn = sp->gfn; slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); @@ -2131,7 +2135,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PT_PAGE_TABLE_LEVEL && need_sync) kvm_sync_pages(vcpu, gfn); - account_shadowed(vcpu->kvm, gfn); + account_shadowed(vcpu->kvm, sp); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; init_shadow_page_table(sp); @@ -2312,7 +2316,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_mmu_unlink_parents(kvm, sp); if (!sp->role.invalid && !sp->role.direct) - unaccount_shadowed(kvm, sp->gfn); + unaccount_shadowed(kvm, sp); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); -- cgit v1.2.1 From 630994b3c798dec3de1fb1d5a3dd9201267036f6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 12 May 2015 22:42:04 -0300 Subject: KVM: x86: add module parameter to disable periodic kvmclock sync The periodic kvmclock sync can be an undesired source of latencies. When running cyclictest on a guest, a latency spike is visible. With kvmclock periodic sync disabled, the spike is gone. Guests should use ntp which means the propagations of ntp corrections from the host clock are unnecessary. v2: -> Make parameter read-only (Radim) -> Return early on kvmclock_sync_fn (Andrew) Reported-and-tested-by: Luiz Capitulino Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ba7b0cc52fed..2211213a84e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -99,6 +99,9 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); unsigned int min_timer_period_us = 500; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); +static bool __read_mostly kvmclock_periodic_sync = true; +module_param(kvmclock_periodic_sync, bool, S_IRUGO); + bool kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; @@ -1767,6 +1770,9 @@ static void kvmclock_sync_fn(struct work_struct *work) kvmclock_sync_work); struct kvm *kvm = container_of(ka, struct kvm, arch); + if (!kvmclock_periodic_sync) + return; + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); @@ -7221,6 +7227,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + if (!kvmclock_periodic_sync) + return; + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); } -- cgit v1.2.1 From 257b9a5faab5849fa61f1a523b429dc279d33cc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 22 May 2015 18:45:11 +0200 Subject: KVM: x86: use correct APIC ID on x2APIC transition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SDM April 2015, 10.12.5 State Changes From xAPIC Mode to x2APIC Mode • Any APIC ID value written to the memory-mapped local APIC ID register is not preserved. Fix it by sourcing vcpu_id (= initial APIC ID) instead of memory-mapped APIC ID. Proper use of apic functions would result in two calls to recalculate_apic_map(), so this patch makes a new helper. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index dc5b57bb1f76..c2a7d8a7a414 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -240,6 +240,15 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) recalculate_apic_map(apic->vcpu->kvm); } +static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id) +{ + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + + apic_set_reg(apic, APIC_ID, id << 24); + apic_set_reg(apic, APIC_LDR, ldr); + recalculate_apic_map(apic->vcpu->kvm); +} + static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); @@ -1538,9 +1547,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((old_value ^ value) & X2APIC_ENABLE) { if (value & X2APIC_ENABLE) { - u32 id = kvm_apic_id(apic); - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_apic_set_ldr(apic, ldr); + kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); } else kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); -- cgit v1.2.1 From c028dd6bb648251b773f900f59e53544278b3b3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 22 May 2015 19:22:10 +0200 Subject: KVM: x86: preserve x2APIC LDR on INIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Logical x2APIC stops working if we rewrite it with zeros. The best references are SDM April 2015: 10.12.10.1 Logical Destination Mode in x2APIC Mode [...], the LDR are initialized by hardware based on the value of x2APIC ID upon x2APIC state transitions. and SDM April 2015: 10.12.10.2 Deriving Logical x2APIC ID from the Local x2APIC ID The LDR initialization occurs whenever the x2APIC mode is enabled Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c2a7d8a7a414..c789e00dfa8b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1594,7 +1594,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic_set_reg(apic, APIC_DFR, 0xffffffffU); apic_set_spiv(apic, 0xff); apic_set_reg(apic, APIC_TASKPRI, 0); - kvm_apic_set_ldr(apic, 0); + if (!apic_x2apic_mode(apic)) + kvm_apic_set_ldr(apic, 0); apic_set_reg(apic, APIC_ESR, 0); apic_set_reg(apic, APIC_ICR, 0); apic_set_reg(apic, APIC_ICR2, 0); -- cgit v1.2.1 From e453aa0f7e7b10135d7a84742722f5fc799a3df9 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 24 May 2015 17:22:38 +0200 Subject: KVM: x86: Allow ARAT CPU feature There is no reason to deny this feature to guests. We are emulating the APIC timer, thus are exposing it without stops in power-saving states. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 92a74a0428aa..9dadf8d67873 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -415,6 +415,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 6: /* Thermal management */ + entry->eax = 0x4; /* allow ARAT */ + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; case 7: { entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ @@ -591,7 +597,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ - case 6: /* Thermal management */ case 0xC0000002: case 0xC0000003: case 0xC0000004: -- cgit v1.2.1 From 4141259b56cbd5a8614e487366909a02d702b9dd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 27 May 2015 11:53:06 +0200 Subject: arch/x86/kvm/mmu.c: work around gcc-4.4.4 bug arch/x86/kvm/mmu.c: In function 'kvm_mmu_pte_write': arch/x86/kvm/mmu.c:4256: error: unknown field 'cr0_wp' specified in initializer arch/x86/kvm/mmu.c:4257: error: unknown field 'cr4_pae' specified in initializer arch/x86/kvm/mmu.c:4257: warning: excess elements in union initializer ... gcc-4.4.4 (at least) has issues when using anonymous unions in initializers. Fixes: edc90b7dc4ceef6 ("KVM: MMU: fix SMAP virtualization") Cc: Xiao Guangrong Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index deb8862cfd54..a65ce12470f8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4258,13 +4258,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush, zap_page; - union kvm_mmu_page_role mask = (union kvm_mmu_page_role) { - .cr0_wp = 1, - .cr4_pae = 1, - .nxe = 1, - .smep_andnot_wp = 1, - .smap_andnot_wp = 1, - }; + union kvm_mmu_page_role mask = { }; + + mask.cr0_wp = 1; + mask.cr4_pae = 1; + mask.nxe = 1; + mask.smep_andnot_wp = 1; + mask.smap_andnot_wp = 1; /* * If we don't have indirect shadow pages, it means no page is -- cgit v1.2.1 From 1e6277de3a23373b89e0affc3d179f2173b857a4 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 28 May 2015 09:29:27 +0100 Subject: x86/mm: Mark arch_ioremap_p{m,u}d_supported() __init ... as their only caller is. Signed-off-by: Jan Beulich Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5566EE07020000780007E683@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 82d63ed70045..b0da3588b452 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -365,7 +365,7 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int arch_ioremap_pud_supported(void) +int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 return cpu_has_gbpages; @@ -374,7 +374,7 @@ int arch_ioremap_pud_supported(void) #endif } -int arch_ioremap_pmd_supported(void) +int __init arch_ioremap_pmd_supported(void) { return cpu_has_pse; } -- cgit v1.2.1 From b01aec9b2c7d32f17a37553df63efa9f7c0fdaa0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 21 May 2015 19:59:31 +0200 Subject: EDAC: Cleanup atomic_scrub mess So first of all, this atomic_scrub() function's naming is bad. It looks like an atomic_t helper. Change it to edac_atomic_scrub(). The bigger problem is that this function is arch-specific and every new arch which doesn't necessarily need that functionality still needs to define it, otherwise EDAC doesn't compile. So instead of doing that and including arch-specific headers, have each arch define an EDAC_ATOMIC_SCRUB symbol which can be used in edac_mc.c for ifdeffery. Much cleaner. And we already are doing this with another symbol - EDAC_SUPPORT. This is also much cleaner than having CONFIG_EDAC enumerate all the arches which need/have EDAC support and drivers. This way I can kill the useless edac.h header in tile too. Acked-by: Ralf Baechle Acked-by: Michael Ellerman Acked-by: Chris Metcalf Acked-by: Ingo Molnar Acked-by: Russell King Cc: Benjamin Herrenschmidt Cc: Doug Thompson Cc: linux-arm-kernel@lists.infradead.org Cc: linux-edac@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linuxppc-dev@lists.ozlabs.org Cc: "Maciej W. Rozycki" Cc: Markos Chandras Cc: Mauro Carvalho Chehab Cc: Paul Mackerras Cc: "Steven J. Hill" Cc: x86@kernel.org Signed-off-by: Borislav Petkov --- arch/x86/Kconfig | 2 ++ arch/x86/include/asm/edac.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 226d5696e1d1..482c160a9fe9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -143,6 +143,8 @@ config X86 select ACPI_LEGACY_TABLES_LOOKUP if ACPI select X86_FEATURE_NAMES if PROC_FS select SRCU + select EDAC_SUPPORT + select EDAC_ATOMIC_SCRUB config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h index e9b57ecc70c5..cf8fdf83b231 100644 --- a/arch/x86/include/asm/edac.h +++ b/arch/x86/include/asm/edac.h @@ -3,7 +3,7 @@ /* ECC atomic, DMA, SMP and interrupt safe scrub function */ -static inline void atomic_scrub(void *va, u32 size) +static inline void edac_atomic_scrub(void *va, u32 size) { u32 i, *virt_addr = va; -- cgit v1.2.1 From 7ba554b5ac69e5f3edac9ce3233beb5acd480878 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 28 May 2015 09:16:45 +0100 Subject: x86/asm/entry/32: Really make user_mode() work correctly for VM86 mode While commit efa7045103 ("x86/asm/entry: Make user_mode() work correctly if regs came from VM86 mode") claims that "user_mode() is now identical to user_mode_vm()", this wasn't actually the case - no prior commit made it so. Signed-off-by: Jan Beulich Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5566EB0D020000780007E655@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 19507ffa5d28..5fabf1362942 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -107,7 +107,7 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) static inline int user_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 - return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL; + return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL; #else return !!(regs->cs & 3); #endif -- cgit v1.2.1 From 927392d73a97d8d235bb65400e2e3c7f0bec2b6f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 23 Nov 2012 19:19:07 +0100 Subject: x86/boot: Add CONFIG_PARAVIRT_SPINLOCKS quirk to arch/x86/boot/compressed/misc.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linus reported the following new warning on x86 allmodconfig with GCC 5.1: > ./arch/x86/include/asm/spinlock.h: In function ‘arch_spin_lock’: > ./arch/x86/include/asm/spinlock.h:119:3: warning: implicit declaration > of function ‘__ticket_lock_spinning’ [-Wimplicit-function-declaration] > __ticket_lock_spinning(lock, inc.tail); > ^ This warning triggers because of these hacks in misc.h: /* * we have to be careful, because no indirections are allowed here, and * paravirt_ops is a kind of one. As it will only run in baremetal anyway, * we just keep it from happening */ #undef CONFIG_PARAVIRT #undef CONFIG_KASAN But these hacks were not updated when CONFIG_PARAVIRT_SPINLOCKS was added, and eventually (with the introduction of queued paravirt spinlocks in recent kernels) this created an invalid Kconfig combination and broke the build. So add a CONFIG_PARAVIRT_SPINLOCKS #undef line as well. Also remove the _ASM_X86_DESC_H quirk: that undocumented quirk was originally added ages ago, in: 099e1377269a ("x86: use ELF format in compressed images.") and I went back to that kernel (and fixed up the main Makefile which didn't build anymore) and checked what failure it avoided: it avoided an include file dependencies related build failure related to our old x86-platforms code. That old code is long gone, the header dependencies got cleaned up, and the build does not fail anymore with the totality of asm/desc.h included - so remove the quirk. Reported-by: Linus Torvalds Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/misc.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 89dd0d78013a..805d25ca5f1d 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -2,15 +2,14 @@ #define BOOT_COMPRESSED_MISC_H /* - * we have to be careful, because no indirections are allowed here, and - * paravirt_ops is a kind of one. As it will only run in baremetal anyway, - * we just keep it from happening + * Special hack: we have to be careful, because no indirections are allowed here, + * and paravirt_ops is a kind of one. As it will only run in baremetal anyway, + * we just keep it from happening. (This list needs to be extended when new + * paravirt and debugging variants are added.) */ #undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_SPINLOCKS #undef CONFIG_KASAN -#ifdef CONFIG_X86_32 -#define _ASM_X86_DESC_H 1 -#endif #include #include -- cgit v1.2.1 From 611917258f88fb2fa41d1c2826b193f7c3885b54 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 28 May 2015 20:20:39 -0300 Subject: x86: kvmclock: add flag to indicate pvclock counts from zero Setting sched clock stable for kvmclock causes the printk timestamps to not start from zero, which is different from baremetal and can possibly break userspace. Add a flag to indicate that hypervisor sets clock base at zero when kvmclock is initialized. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/pvclock-abi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 6167fd798188..655e07a48f6c 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -41,5 +41,6 @@ struct pvclock_wall_clock { #define PVCLOCK_TSC_STABLE_BIT (1 << 0) #define PVCLOCK_GUEST_STOPPED (1 << 1) +#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ -- cgit v1.2.1 From 0ad83caa21bff9f383c10e73b452425709573042 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Thu, 28 May 2015 20:20:40 -0300 Subject: x86: kvmclock: set scheduler clock stable If you try to enable NOHZ_FULL on a guest today, you'll get the following error when the guest tries to deactivate the scheduler tick: WARNING: CPU: 3 PID: 2182 at kernel/time/tick-sched.c:192 can_stop_full_tick+0xb9/0x290() NO_HZ FULL will not work with unstable sched clock CPU: 3 PID: 2182 Comm: kworker/3:1 Not tainted 4.0.0-10545-gb9bb6fb #204 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Workqueue: events flush_to_ldisc ffffffff8162a0c7 ffff88011f583e88 ffffffff814e6ba0 0000000000000002 ffff88011f583ed8 ffff88011f583ec8 ffffffff8104d095 ffff88011f583eb8 0000000000000000 0000000000000003 0000000000000001 0000000000000001 Call Trace: [] dump_stack+0x4f/0x7b [] warn_slowpath_common+0x85/0xc0 [] warn_slowpath_fmt+0x46/0x50 [] can_stop_full_tick+0xb9/0x290 [] tick_nohz_irq_exit+0x8d/0xb0 [] irq_exit+0xc5/0x130 [] smp_apic_timer_interrupt+0x4a/0x60 [] apic_timer_interrupt+0x6e/0x80 [] ? _raw_spin_unlock_irqrestore+0x31/0x60 [] __wake_up+0x48/0x60 [] n_tty_receive_buf_common+0x49c/0xba0 [] ? tty_ldisc_ref+0x1f/0x70 [] n_tty_receive_buf2+0x14/0x20 [] flush_to_ldisc+0xe0/0x120 [] process_one_work+0x1d5/0x540 [] ? process_one_work+0x151/0x540 [] worker_thread+0x121/0x470 [] ? process_one_work+0x540/0x540 [] kthread+0xef/0x110 [] ? __kthread_parkme+0xa0/0xa0 [] ret_from_fork+0x42/0x70 [] ? __kthread_parkme+0xa0/0xa0 ---[ end trace 06e3507544a38866 ]--- However, it turns out that kvmclock does provide a stable sched_clock callback. So, let the scheduler know this which in turn makes NOHZ_FULL work in the guest. Signed-off-by: Marcelo Tosatti Signed-off-by: Luiz Capitulino Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 42caaef897c8..49487b488061 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -217,8 +218,10 @@ static void kvm_shutdown(void) void __init kvmclock_init(void) { + struct pvclock_vcpu_time_info *vcpu_time; unsigned long mem; - int size; + int size, cpu; + u8 flags; size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); @@ -264,7 +267,14 @@ void __init kvmclock_init(void) pv_info.name = "KVM"; if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + pvclock_set_flags(~0); + + cpu = get_cpu(); + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + if (flags & PVCLOCK_COUNTS_FROM_ZERO) + set_sched_clock_stable(); + put_cpu(); } int __init kvm_setup_vsyscall_timeinfo(void) -- cgit v1.2.1 From b7e60c5aedd2b63f16ef06fde4f81ca032211bc5 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 28 May 2015 20:20:41 -0300 Subject: KVM: x86: zero kvmclock_offset when vcpu0 initializes kvmclock system MSR Initialize kvmclock base, on kvmclock system MSR write time, so that the guest sees kvmclock counting from zero. This matches baremetal behaviour when kvmclock in guest sets sched clock stable. Signed-off-by: Marcelo Tosatti [Remove unnecessary comment. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2211213a84e7..79dde1656db6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1700,6 +1700,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->pvclock_set_guest_stopped_request = false; } + pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO; + /* If the host uses TSC clocksource, then it is stable */ if (use_master_clock) pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; @@ -2282,6 +2284,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &vcpu->requests); ka->boot_vcpu_runs_old_kvmclock = tmp; + + ka->kvmclock_offset = -get_kernel_ns(); } vcpu->arch.time = data; -- cgit v1.2.1 From 2779567346fc229fc286d0793f6f5e41f67aafe9 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 12 May 2015 00:07:07 +0200 Subject: um: Cleanup mem_32/64.c headers Include only headers we really need. Signed-off-by: Richard Weinberger --- arch/x86/um/mem_32.c | 3 +-- arch/x86/um/mem_64.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index f40281e5d6a2..744afdc18cf3 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -7,8 +7,7 @@ */ #include -#include -#include +#include static struct vm_area_struct gate_vma; diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c index f8fecaddcc0d..7642e2e2aa61 100644 --- a/arch/x86/um/mem_64.c +++ b/arch/x86/um/mem_64.c @@ -1,6 +1,5 @@ #include -#include -#include +#include const char *arch_vma_name(struct vm_area_struct *vma) { -- cgit v1.2.1 From 9b46e2128e2506d88aa81c3c17eec42a5792089c Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 12 May 2015 00:09:52 +0200 Subject: um: Add asm/elf.h to vma.c um_vdso_addr is defined in asm/elf.h. Signed-off-by: Richard Weinberger --- arch/x86/um/vdso/vma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 916cda4cd5b4..237c6831e095 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -10,6 +10,7 @@ #include #include #include +#include #include static unsigned int __read_mostly vdso_enabled = 1; -- cgit v1.2.1 From 7017f42f3a6c3222a6f2c26ca44ad07a958cd78d Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 12 May 2015 00:13:05 +0200 Subject: um: Add uaccess.h to syscalls_64.c It's using put_user() and needs this header. Signed-off-by: Richard Weinberger --- arch/x86/um/syscalls_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index adb08eb5c22a..e6552275320b 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -6,6 +6,7 @@ */ #include +#include #include /* XXX This should get the constants from libc */ #include -- cgit v1.2.1 From 1d04c8d77934d9406cc8219d97be2f0188f502d2 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 12 May 2015 00:14:39 +0200 Subject: um: Add uaccess.h to ldt.c ...for userspace memory access. Signed-off-by: Richard Weinberger --- arch/x86/um/ldt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 5c0b711d2433..9701a4fd7bf2 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.1 From f8d65d27e677da0ce33de570e3068308a77ed2b1 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 12 May 2015 00:17:28 +0200 Subject: um: Rework uaccess code Rework UML's uaccess code to reuse as much as possible from asm-generic/uaccess.c. Signed-off-by: Richard Weinberger --- arch/x86/um/asm/checksum.h | 1 + arch/x86/um/asm/elf.h | 2 -- arch/x86/um/asm/processor.h | 2 ++ arch/x86/um/asm/segment.h | 8 ++++++++ 4 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h index 4b181b74454f..ee940185e89f 100644 --- a/arch/x86/um/asm/checksum.h +++ b/arch/x86/um/asm/checksum.h @@ -3,6 +3,7 @@ #include #include +#include /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 0a656b727b1a..548197212a45 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -200,8 +200,6 @@ typedef elf_greg_t elf_gregset_t[ELF_NGREG]; typedef struct user_i387_struct elf_fpregset_t; -#define task_pt_regs(t) (&(t)->thread.regs) - struct task_struct; extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 2a206d2b14ab..233ee09c1ce8 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -28,6 +28,8 @@ static inline void rep_nop(void) #define cpu_relax() rep_nop() #define cpu_relax_lowlatency() cpu_relax() +#define task_pt_regs(t) (&(t)->thread.regs) + #include #endif diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h index 45183fcd10b6..41dd5e1f3cd7 100644 --- a/arch/x86/um/asm/segment.h +++ b/arch/x86/um/asm/segment.h @@ -7,4 +7,12 @@ extern int host_gdt_entry_tls_min; #define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) +typedef struct { + unsigned long seg; +} mm_segment_t; + +#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) +#define KERNEL_DS MAKE_MM_SEG(~0UL) +#define USER_DS MAKE_MM_SEG(TASK_SIZE) + #endif -- cgit v1.2.1 From 6c684465587aab2a0d2712ee755c0164fa33efd1 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 13 May 2015 23:02:14 +0200 Subject: um: Fix warning in setup_signal_stack_si() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: arch/x86/um/signal.c: In function ‘setup_signal_stack_si’: include/asm-generic/uaccess.h:146:27: warning: initialization from incompatible pointer type [enabled by default] __typeof__(*(ptr)) __x = (x); \ ^ arch/x86/um/signal.c:544:10: note: in expansion of macro ‘__put_user’ err |= __put_user(ksig->ka.sa.sa_restorer, Signed-off-by: Richard Weinberger --- arch/x86/um/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 592491d1d70d..06934a8a4872 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -541,7 +541,8 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, */ /* x86-64 should always use SA_RESTORER. */ if (ksig->ka.sa.sa_flags & SA_RESTORER) - err |= __put_user(ksig->ka.sa.sa_restorer, &frame->pretcode); + err |= __put_user((void *)ksig->ka.sa.sa_restorer, + &frame->pretcode); else /* could use a vstub here */ return err; -- cgit v1.2.1 From 298e20ba8c197e8d429a6c8671550c41c7919033 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 31 May 2015 19:50:57 +0200 Subject: um: Stop abusing __KERNEL__ Currently UML is abusing __KERNEL__ to distinguish between kernel and host code (os-Linux). It is better to use a custom define such that existing users of __KERNEL__ don't get confused. Signed-off-by: Richard Weinberger --- arch/x86/um/shared/sysdep/tls.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/shared/sysdep/tls.h b/arch/x86/um/shared/sysdep/tls.h index 27cce00c6b30..a682db13df23 100644 --- a/arch/x86/um/shared/sysdep/tls.h +++ b/arch/x86/um/shared/sysdep/tls.h @@ -1,7 +1,7 @@ #ifndef _SYSDEP_TLS_H #define _SYSDEP_TLS_H -# ifndef __KERNEL__ +#ifdef __UM_HOST__ /* Change name to avoid conflicts with the original one from , which * may be named user_desc (but in 2.4 and in header matching its API was named @@ -22,11 +22,11 @@ typedef struct um_dup_user_desc { #endif } user_desc_t; -# else /* __KERNEL__ */ +#else /* __UM_HOST__ */ typedef struct user_desc user_desc_t; -# endif /* __KERNEL__ */ +#endif /* __UM_HOST__ */ extern int os_set_thread_area(user_desc_t *info, int pid); extern int os_get_thread_area(user_desc_t *info, int pid); -- cgit v1.2.1 From 131484c8da97ed600c18dd9d03b661e8ae052df6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 May 2015 12:21:47 +0200 Subject: x86/debug: Remove perpetually broken, unmaintainable dwarf annotations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So the dwarf2 annotations in low level assembly code have become an increasing hindrance: unreadable, messy macros mixed into some of the most security sensitive code paths of the Linux kernel. These debug info annotations don't even buy the upstream kernel anything: dwarf driven stack unwinding has caused problems in the past so it's out of tree, and the upstream kernel only uses the much more robust framepointers based stack unwinding method. In addition to that there's a steady, slow bitrot going on with these annotations, requiring frequent fixups. There's no tooling and no functionality upstream that keeps it correct. So burn down the sick forest, allowing new, healthier growth: 27 files changed, 350 insertions(+), 1101 deletions(-) Someone who has the willingness and time to do this properly can attempt to reintroduce dwarf debuginfo in x86 assembly code plus dwarf unwinding from first principles, with the following conditions: - it should be maximally readable, and maximally low-key to 'ordinary' code reading and maintenance. - find a build time method to insert dwarf annotations automatically in the most common cases, for pop/push instructions that manipulate the stack pointer. This could be done for example via a preprocessing step that just looks for common patterns - plus special annotations for the few cases where we want to depart from the default. We have hundreds of CFI annotations, so automating most of that makes sense. - it should come with build tooling checks that ensure that CFI annotations are sensible. We've seen such efforts from the framepointer side, and there's no reason it couldn't be done on the dwarf side. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frédéric Weisbecker Cc: Jan Beulich Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 10 +- arch/x86/ia32/ia32entry.S | 133 ++++----------- arch/x86/include/asm/calling.h | 94 +++++------ arch/x86/include/asm/dwarf2.h | 170 ------------------- arch/x86/include/asm/frame.h | 7 +- arch/x86/kernel/entry_32.S | 368 ++++++++++++----------------------------- arch/x86/kernel/entry_64.S | 288 ++++++-------------------------- arch/x86/lib/atomic64_386_32.S | 7 +- arch/x86/lib/atomic64_cx8_32.S | 61 +++---- arch/x86/lib/checksum_32.S | 52 +++--- arch/x86/lib/clear_page_64.S | 7 - arch/x86/lib/cmpxchg16b_emu.S | 12 +- arch/x86/lib/cmpxchg8b_emu.S | 11 +- arch/x86/lib/copy_page_64.S | 11 -- arch/x86/lib/copy_user_64.S | 15 -- arch/x86/lib/csum-copy_64.S | 17 -- arch/x86/lib/getuser.S | 13 -- arch/x86/lib/iomap_copy_64.S | 3 - arch/x86/lib/memcpy_64.S | 3 - arch/x86/lib/memmove_64.S | 3 - arch/x86/lib/memset_64.S | 5 - arch/x86/lib/msr-reg.S | 44 ++--- arch/x86/lib/putuser.S | 8 +- arch/x86/lib/rwsem.S | 49 +++--- arch/x86/lib/thunk_32.S | 15 +- arch/x86/lib/thunk_64.S | 44 +++-- arch/x86/net/bpf_jit.S | 1 - 27 files changed, 350 insertions(+), 1101 deletions(-) delete mode 100644 arch/x86/include/asm/dwarf2.h (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 57996ee840dd..43e8328a23e4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -149,12 +149,6 @@ endif sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp -# do binutils support CFI? -cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) -# is .cfi_signal_frame supported too? -cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) -cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) - # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) @@ -162,8 +156,8 @@ asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 63450a596800..2be23c734db5 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -4,7 +4,6 @@ * Copyright 2000-2002 Andi Kleen, SuSE Labs. */ -#include #include #include #include @@ -60,17 +59,6 @@ movl %eax,%eax /* zero extension */ .endm - .macro CFI_STARTPROC32 simple - CFI_STARTPROC \simple - CFI_UNDEFINED r8 - CFI_UNDEFINED r9 - CFI_UNDEFINED r10 - CFI_UNDEFINED r11 - CFI_UNDEFINED r12 - CFI_UNDEFINED r13 - CFI_UNDEFINED r14 - CFI_UNDEFINED r15 - .endm #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret32) @@ -102,11 +90,6 @@ ENDPROC(native_usergs_sysret32) * with the int 0x80 path. */ ENTRY(ia32_sysenter_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rsp,rbp - /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -121,25 +104,21 @@ ENTRY(ia32_sysenter_target) movl %eax, %eax movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - CFI_REGISTER rip,r10 /* Construct struct pt_regs on stack */ - pushq_cfi $__USER32_DS /* pt_regs->ss */ - pushq_cfi %rbp /* pt_regs->sp */ - CFI_REL_OFFSET rsp,0 - pushfq_cfi /* pt_regs->flags */ - pushq_cfi $__USER32_CS /* pt_regs->cs */ - pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp */ + pushfq /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ cld sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 /* * no need to do an access_ok check here because rbp has been @@ -161,8 +140,8 @@ sysenter_flags_fixed: orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - CFI_REMEMBER_STATE jnz sysenter_tracesys + sysenter_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ @@ -193,14 +172,12 @@ sysexit_from_sys_call: */ andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) movl RIP(%rsp),%ecx /* User %eip */ - CFI_REGISTER rip,rcx RESTORE_RSI_RDI xorl %edx,%edx /* avoid info leaks */ xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 movl EFLAGS(%rsp),%r11d /* User eflags */ - /*CFI_RESTORE rflags*/ TRACE_IRQS_ON /* @@ -231,8 +208,6 @@ sysexit_from_sys_call: */ USERGS_SYSRET32 - CFI_RESTORE_STATE - #ifdef CONFIG_AUDITSYSCALL .macro auditsys_entry_common movl %esi,%r8d /* 5th arg: 4th syscall arg */ @@ -282,8 +257,8 @@ sysexit_audit: #endif sysenter_fix_flags: - pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) - popfq_cfi + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq jmp sysenter_flags_fixed sysenter_tracesys: @@ -298,7 +273,6 @@ sysenter_tracesys: LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS jmp sysenter_do_call - CFI_ENDPROC ENDPROC(ia32_sysenter_target) /* @@ -332,12 +306,6 @@ ENDPROC(ia32_sysenter_target) * with the int 0x80 path. */ ENTRY(ia32_cstar_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -345,7 +313,6 @@ ENTRY(ia32_cstar_target) */ SWAPGS_UNSAFE_STACK movl %esp,%r8d - CFI_REGISTER rsp,r8 movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp ENABLE_INTERRUPTS(CLBR_NONE) @@ -353,22 +320,19 @@ ENTRY(ia32_cstar_target) movl %eax,%eax /* Construct struct pt_regs on stack */ - pushq_cfi $__USER32_DS /* pt_regs->ss */ - pushq_cfi %r8 /* pt_regs->sp */ - CFI_REL_OFFSET rsp,0 - pushq_cfi %r11 /* pt_regs->flags */ - pushq_cfi $__USER32_CS /* pt_regs->cs */ - pushq_cfi %rcx /* pt_regs->ip */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rbp /* pt_regs->cx */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rbp /* pt_regs->cx */ movl %ebp,%ecx - pushq_cfi $-ENOSYS /* pt_regs->ax */ + pushq $-ENOSYS /* pt_regs->ax */ sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 /* * no need to do an access_ok check here because r8 has been @@ -380,8 +344,8 @@ ENTRY(ia32_cstar_target) ASM_CLAC orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - CFI_REMEMBER_STATE jnz cstar_tracesys + cstar_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ @@ -403,15 +367,12 @@ sysretl_from_sys_call: andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) RESTORE_RSI_RDI_RDX movl RIP(%rsp),%ecx - CFI_REGISTER rip,rcx movl EFLAGS(%rsp),%r11d - /*CFI_REGISTER rflags,r11*/ xorq %r10,%r10 xorq %r9,%r9 xorq %r8,%r8 TRACE_IRQS_ON movl RSP(%rsp),%esp - CFI_RESTORE rsp /* * 64bit->32bit SYSRET restores eip from ecx, * eflags from r11 (but RF and VM bits are forced to 0), @@ -430,7 +391,6 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: - CFI_RESTORE_STATE movl %r9d,R9(%rsp) /* register to be clobbered by call */ auditsys_entry_common movl R9(%rsp),%r9d /* reload 6th syscall arg */ @@ -460,7 +420,6 @@ ia32_badarg: ASM_CLAC movq $-EFAULT,%rax jmp ia32_sysret - CFI_ENDPROC /* * Emulated IA32 system calls via int 0x80. @@ -484,15 +443,6 @@ ia32_badarg: */ ENTRY(ia32_syscall) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,5*8 - /*CFI_REL_OFFSET ss,4*8 */ - CFI_REL_OFFSET rsp,3*8 - /*CFI_REL_OFFSET rflags,2*8 */ - /*CFI_REL_OFFSET cs,1*8 */ - CFI_REL_OFFSET rip,0*8 - /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -506,15 +456,14 @@ ENTRY(ia32_syscall) movl %eax,%eax /* Construct struct pt_regs on stack (iret frame is already on stack) */ - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ cld sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -544,7 +493,6 @@ ia32_tracesys: LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS jmp ia32_do_call - CFI_ENDPROC END(ia32_syscall) .macro PTREGSCALL label, func @@ -554,8 +502,6 @@ GLOBAL(\label) jmp ia32_ptregs_common .endm - CFI_STARTPROC32 - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn PTREGSCALL stub32_sigreturn, sys32_sigreturn PTREGSCALL stub32_fork, sys_fork @@ -569,23 +515,8 @@ GLOBAL(stub32_clone) ALIGN ia32_ptregs_common: - CFI_ENDPROC - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SIZEOF_PTREGS - CFI_REL_OFFSET rax,RAX - CFI_REL_OFFSET rcx,RCX - CFI_REL_OFFSET rdx,RDX - CFI_REL_OFFSET rsi,RSI - CFI_REL_OFFSET rdi,RDI - CFI_REL_OFFSET rip,RIP -/* CFI_REL_OFFSET cs,CS*/ -/* CFI_REL_OFFSET rflags,EFLAGS*/ - CFI_REL_OFFSET rsp,RSP -/* CFI_REL_OFFSET ss,SS*/ SAVE_EXTRA_REGS 8 call *%rax RESTORE_EXTRA_REGS 8 ret - CFI_ENDPROC END(ia32_ptregs_common) diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 1c8b50edb2db..0d76accde45b 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -46,8 +46,6 @@ For 32-bit we have the following conventions - kernel is built with */ -#include - #ifdef CONFIG_X86_64 /* @@ -92,27 +90,26 @@ For 32-bit we have the following conventions - kernel is built with .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 subq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET 15*8+\addskip .endm .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 .if \r11 - movq_cfi r11, 6*8+\offset + movq %r11, 6*8+\offset(%rsp) .endif .if \r8910 - movq_cfi r10, 7*8+\offset - movq_cfi r9, 8*8+\offset - movq_cfi r8, 9*8+\offset + movq %r10, 7*8+\offset(%rsp) + movq %r9, 8*8+\offset(%rsp) + movq %r8, 9*8+\offset(%rsp) .endif .if \rax - movq_cfi rax, 10*8+\offset + movq %rax, 10*8+\offset(%rsp) .endif .if \rcx - movq_cfi rcx, 11*8+\offset + movq %rcx, 11*8+\offset(%rsp) .endif - movq_cfi rdx, 12*8+\offset - movq_cfi rsi, 13*8+\offset - movq_cfi rdi, 14*8+\offset + movq %rdx, 12*8+\offset(%rsp) + movq %rsi, 13*8+\offset(%rsp) + movq %rdi, 14*8+\offset(%rsp) .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -131,24 +128,24 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_EXTRA_REGS offset=0 - movq_cfi r15, 0*8+\offset - movq_cfi r14, 1*8+\offset - movq_cfi r13, 2*8+\offset - movq_cfi r12, 3*8+\offset - movq_cfi rbp, 4*8+\offset - movq_cfi rbx, 5*8+\offset + movq %r15, 0*8+\offset(%rsp) + movq %r14, 1*8+\offset(%rsp) + movq %r13, 2*8+\offset(%rsp) + movq %r12, 3*8+\offset(%rsp) + movq %rbp, 4*8+\offset(%rsp) + movq %rbx, 5*8+\offset(%rsp) .endm .macro SAVE_EXTRA_REGS_RBP offset=0 - movq_cfi rbp, 4*8+\offset + movq %rbp, 4*8+\offset(%rsp) .endm .macro RESTORE_EXTRA_REGS offset=0 - movq_cfi_restore 0*8+\offset, r15 - movq_cfi_restore 1*8+\offset, r14 - movq_cfi_restore 2*8+\offset, r13 - movq_cfi_restore 3*8+\offset, r12 - movq_cfi_restore 4*8+\offset, rbp - movq_cfi_restore 5*8+\offset, rbx + movq 0*8+\offset(%rsp), %r15 + movq 1*8+\offset(%rsp), %r14 + movq 2*8+\offset(%rsp), %r13 + movq 3*8+\offset(%rsp), %r12 + movq 4*8+\offset(%rsp), %rbp + movq 5*8+\offset(%rsp), %rbx .endm .macro ZERO_EXTRA_REGS @@ -162,24 +159,24 @@ For 32-bit we have the following conventions - kernel is built with .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 - movq_cfi_restore 6*8, r11 + movq 6*8(%rsp), %r11 .endif .if \rstor_r8910 - movq_cfi_restore 7*8, r10 - movq_cfi_restore 8*8, r9 - movq_cfi_restore 9*8, r8 + movq 7*8(%rsp), %r10 + movq 8*8(%rsp), %r9 + movq 9*8(%rsp), %r8 .endif .if \rstor_rax - movq_cfi_restore 10*8, rax + movq 10*8(%rsp), %rax .endif .if \rstor_rcx - movq_cfi_restore 11*8, rcx + movq 11*8(%rsp), %rcx .endif .if \rstor_rdx - movq_cfi_restore 12*8, rdx + movq 12*8(%rsp), %rdx .endif - movq_cfi_restore 13*8, rsi - movq_cfi_restore 14*8, rdi + movq 13*8(%rsp), %rsi + movq 14*8(%rsp), %rdi .endm .macro RESTORE_C_REGS RESTORE_C_REGS_HELPER 1,1,1,1,1 @@ -205,7 +202,6 @@ For 32-bit we have the following conventions - kernel is built with .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 addq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) .endm .macro icebp @@ -224,23 +220,23 @@ For 32-bit we have the following conventions - kernel is built with */ .macro SAVE_ALL - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx .endm .macro RESTORE_ALL - popl_cfi_reg ebx - popl_cfi_reg ecx - popl_cfi_reg edx - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi_reg ebp - popl_cfi_reg eax + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax .endm #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h deleted file mode 100644 index de1cdaf4d743..000000000000 --- a/arch/x86/include/asm/dwarf2.h +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef _ASM_X86_DWARF2_H -#define _ASM_X86_DWARF2_H - -#ifndef __ASSEMBLY__ -#warning "asm/dwarf2.h should be only included in pure assembly files" -#endif - -/* - * Macros for dwarf2 CFI unwind table entries. - * See "as.info" for details on these pseudo ops. Unfortunately - * they are only supported in very new binutils, so define them - * away for older version. - */ - -#ifdef CONFIG_AS_CFI - -#define CFI_STARTPROC .cfi_startproc -#define CFI_ENDPROC .cfi_endproc -#define CFI_DEF_CFA .cfi_def_cfa -#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register -#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset -#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset -#define CFI_OFFSET .cfi_offset -#define CFI_REL_OFFSET .cfi_rel_offset -#define CFI_REGISTER .cfi_register -#define CFI_RESTORE .cfi_restore -#define CFI_REMEMBER_STATE .cfi_remember_state -#define CFI_RESTORE_STATE .cfi_restore_state -#define CFI_UNDEFINED .cfi_undefined -#define CFI_ESCAPE .cfi_escape - -#ifdef CONFIG_AS_CFI_SIGNAL_FRAME -#define CFI_SIGNAL_FRAME .cfi_signal_frame -#else -#define CFI_SIGNAL_FRAME -#endif - -#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) - /* - * Emit CFI data in .debug_frame sections, not .eh_frame sections. - * The latter we currently just discard since we don't do DWARF - * unwinding at runtime. So only the offline DWARF information is - * useful to anyone. Note we should not use this directive if this - * file is used in the vDSO assembly, or if vmlinux.lds.S gets - * changed so it doesn't discard .eh_frame. - */ - .cfi_sections .debug_frame -#endif - -#else - -/* - * Due to the structure of pre-exisiting code, don't use assembler line - * comment character # to ignore the arguments. Instead, use a dummy macro. - */ -.macro cfi_ignore a=0, b=0, c=0, d=0 -.endm - -#define CFI_STARTPROC cfi_ignore -#define CFI_ENDPROC cfi_ignore -#define CFI_DEF_CFA cfi_ignore -#define CFI_DEF_CFA_REGISTER cfi_ignore -#define CFI_DEF_CFA_OFFSET cfi_ignore -#define CFI_ADJUST_CFA_OFFSET cfi_ignore -#define CFI_OFFSET cfi_ignore -#define CFI_REL_OFFSET cfi_ignore -#define CFI_REGISTER cfi_ignore -#define CFI_RESTORE cfi_ignore -#define CFI_REMEMBER_STATE cfi_ignore -#define CFI_RESTORE_STATE cfi_ignore -#define CFI_UNDEFINED cfi_ignore -#define CFI_ESCAPE cfi_ignore -#define CFI_SIGNAL_FRAME cfi_ignore - -#endif - -/* - * An attempt to make CFI annotations more or less - * correct and shorter. It is implied that you know - * what you're doing if you use them. - */ -#ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_64 - .macro pushq_cfi reg - pushq \reg - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro pushq_cfi_reg reg - pushq %\reg - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popq_cfi reg - popq \reg - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro popq_cfi_reg reg - popq %\reg - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE \reg - .endm - - .macro pushfq_cfi - pushfq - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro popfq_cfi - popfq - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro movq_cfi reg offset=0 - movq %\reg, \offset(%rsp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movq_cfi_restore offset reg - movq \offset(%rsp), %\reg - CFI_RESTORE \reg - .endm -#else /*!CONFIG_X86_64*/ - .macro pushl_cfi reg - pushl \reg - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro pushl_cfi_reg reg - pushl %\reg - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popl_cfi reg - popl \reg - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro popl_cfi_reg reg - popl %\reg - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE \reg - .endm - - .macro pushfl_cfi - pushfl - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro popfl_cfi - popfl - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro movl_cfi reg offset=0 - movl %\reg, \offset(%esp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movl_cfi_restore offset reg - movl \offset(%esp), %\reg - CFI_RESTORE \reg - .endm -#endif /*!CONFIG_X86_64*/ -#endif /*__ASSEMBLY__*/ - -#endif /* _ASM_X86_DWARF2_H */ diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 3b629f47eb65..793179cf8e21 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,20 +1,17 @@ #ifdef __ASSEMBLY__ #include -#include /* The annotation hides the frame from the unwinder and makes it look like a ordinary ebp save/restore. This avoids some special cases for frame pointer later */ #ifdef CONFIG_FRAME_POINTER .macro FRAME - __ASM_SIZE(push,_cfi) %__ASM_REG(bp) - CFI_REL_OFFSET __ASM_REG(bp), 0 + __ASM_SIZE(push,) %__ASM_REG(bp) __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) .endm .macro ENDFRAME - __ASM_SIZE(pop,_cfi) %__ASM_REG(bp) - CFI_RESTORE __ASM_REG(bp) + __ASM_SIZE(pop,) %__ASM_REG(bp) .endm #else .macro FRAME diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 1c309763e321..0ac73de925d1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -113,11 +112,10 @@ /* unfortunately push/pop can't be no-op */ .macro PUSH_GS - pushl_cfi $0 + pushl $0 .endm .macro POP_GS pop=0 addl $(4 + \pop), %esp - CFI_ADJUST_CFA_OFFSET -(4 + \pop) .endm .macro POP_GS_EX .endm @@ -137,16 +135,13 @@ #else /* CONFIG_X86_32_LAZY_GS */ .macro PUSH_GS - pushl_cfi %gs - /*CFI_REL_OFFSET gs, 0*/ + pushl %gs .endm .macro POP_GS pop=0 -98: popl_cfi %gs - /*CFI_RESTORE gs*/ +98: popl %gs .if \pop <> 0 add $\pop, %esp - CFI_ADJUST_CFA_OFFSET -\pop .endif .endm .macro POP_GS_EX @@ -170,11 +165,9 @@ .macro GS_TO_REG reg movl %gs, \reg - /*CFI_REGISTER gs, \reg*/ .endm .macro REG_TO_PTGS reg movl \reg, PT_GS(%esp) - /*CFI_REL_OFFSET gs, PT_GS*/ .endm .macro SET_KERNEL_GS reg movl $(__KERNEL_STACK_CANARY), \reg @@ -186,26 +179,16 @@ .macro SAVE_ALL cld PUSH_GS - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0;*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0;*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0;*/ - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es @@ -215,30 +198,20 @@ .endm .macro RESTORE_INT_REGS - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebp - CFI_RESTORE ebp - popl_cfi %eax - CFI_RESTORE eax + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax .endm .macro RESTORE_REGS pop=0 RESTORE_INT_REGS -1: popl_cfi %ds - /*CFI_RESTORE ds;*/ -2: popl_cfi %es - /*CFI_RESTORE es;*/ -3: popl_cfi %fs - /*CFI_RESTORE fs;*/ +1: popl %ds +2: popl %es +3: popl %fs POP_GS \pop .pushsection .fixup, "ax" 4: movl $0, (%esp) @@ -254,64 +227,27 @@ POP_GS_EX .endm -.macro RING0_INT_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 3*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_EC_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 4*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_PTREGS_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, PT_OLDESP-PT_EBX - /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ - CFI_OFFSET eip, PT_EIP-PT_OLDESP - /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ - /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ - CFI_OFFSET eax, PT_EAX-PT_OLDESP - CFI_OFFSET ebp, PT_EBP-PT_OLDESP - CFI_OFFSET edi, PT_EDI-PT_OLDESP - CFI_OFFSET esi, PT_ESI-PT_OLDESP - CFI_OFFSET edx, PT_EDX-PT_OLDESP - CFI_OFFSET ecx, PT_ECX-PT_OLDESP - CFI_OFFSET ebx, PT_EBX-PT_OLDESP -.endm - ENTRY(ret_from_fork) - CFI_STARTPROC - pushl_cfi %eax + pushl %eax call schedule_tail GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # Reset kernel eflags - popfl_cfi + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl jmp syscall_exit - CFI_ENDPROC END(ret_from_fork) ENTRY(ret_from_kernel_thread) - CFI_STARTPROC - pushl_cfi %eax + pushl %eax call schedule_tail GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # Reset kernel eflags - popfl_cfi + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl movl PT_EBP(%esp),%eax call *PT_EBX(%esp) movl $0,PT_EAX(%esp) jmp syscall_exit - CFI_ENDPROC ENDPROC(ret_from_kernel_thread) /* @@ -323,7 +259,6 @@ ENDPROC(ret_from_kernel_thread) # userspace resumption stub bypassing syscall exit tracing ALIGN - RING0_PTREGS_FRAME ret_from_exception: preempt_stop(CLBR_ANY) ret_from_intr: @@ -367,17 +302,12 @@ need_resched: jmp need_resched END(resume_kernel) #endif - CFI_ENDPROC /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ # sysenter call handler stub ENTRY(ia32_sysenter_target) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 0 - CFI_REGISTER esp, ebp movl TSS_sysenter_sp0(%esp),%esp sysenter_past_esp: /* @@ -385,14 +315,11 @@ sysenter_past_esp: * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. */ - pushl_cfi $__USER_DS - /*CFI_REL_OFFSET ss, 0*/ - pushl_cfi %ebp - CFI_REL_OFFSET esp, 0 - pushfl_cfi + pushl $__USER_DS + pushl %ebp + pushfl orl $X86_EFLAGS_IF, (%esp) - pushl_cfi $__USER_CS - /*CFI_REL_OFFSET cs, 0*/ + pushl $__USER_CS /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary: TI_sysenter_return @@ -401,10 +328,9 @@ sysenter_past_esp: * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; * and THREAD_SIZE takes us to the bottom. */ - pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - CFI_REL_OFFSET eip, 0 + pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - pushl_cfi %eax + pushl %eax SAVE_ALL ENABLE_INTERRUPTS(CLBR_NONE) @@ -453,11 +379,11 @@ sysenter_audit: /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl_cfi PT_ESI(%esp) /* a3: 5th arg */ - pushl_cfi PT_EDX+4(%esp) /* a2: 4th arg */ + pushl PT_ESI(%esp) /* a3: 5th arg */ + pushl PT_EDX+4(%esp) /* a2: 4th arg */ call __audit_syscall_entry - popl_cfi %ecx /* get that remapped edx off the stack */ - popl_cfi %ecx /* get that remapped esi off the stack */ + popl %ecx /* get that remapped edx off the stack */ + popl %ecx /* get that remapped esi off the stack */ movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -480,7 +406,6 @@ sysexit_audit: jmp sysenter_exit #endif - CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) jmp 1b @@ -491,9 +416,8 @@ ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) - RING0_INT_FRAME # can't unwind into user space anyway ASM_CLAC - pushl_cfi %eax # save orig_eax + pushl %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation @@ -527,7 +451,6 @@ restore_all_notrace: movb PT_CS(%esp), %al andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS #endif restore_nocheck: @@ -543,7 +466,6 @@ ENTRY(iret_exc) _ASM_EXTABLE(irq_return,iret_exc) #ifdef CONFIG_X86_ESPFIX32 - CFI_RESTORE_STATE ldt_ss: #ifdef CONFIG_PARAVIRT /* @@ -577,22 +499,19 @@ ldt_ss: shr $16, %edx mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl_cfi $__ESPFIX_SS - pushl_cfi %eax /* new kernel esp */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ /* Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to * the irqstate after the iret */ DISABLE_INTERRUPTS(CLBR_EAX) lss (%esp), %esp /* switch to espfix segment */ - CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck #endif - CFI_ENDPROC ENDPROC(system_call) # perform work that needs to be done immediately before resumption ALIGN - RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig @@ -634,9 +553,9 @@ work_notifysig: # deal with pending signals and #ifdef CONFIG_VM86 ALIGN work_notifysig_v86: - pushl_cfi %ecx # save ti_flags for do_notify_resume + pushl %ecx # save ti_flags for do_notify_resume call save_v86_state # %eax contains pt_regs pointer - popl_cfi %ecx + popl %ecx movl %eax, %esp jmp 1b #endif @@ -666,9 +585,7 @@ syscall_exit_work: call syscall_trace_leave jmp resume_userspace END(syscall_exit_work) - CFI_ENDPROC - RING0_INT_FRAME # can't unwind into user space anyway syscall_fault: ASM_CLAC GET_THREAD_INFO(%ebp) @@ -685,7 +602,6 @@ sysenter_badsys: movl $-ENOSYS,%eax jmp sysenter_after_call END(sysenter_badsys) - CFI_ENDPROC .macro FIXUP_ESPFIX_STACK /* @@ -701,10 +617,9 @@ END(sysenter_badsys) mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ - pushl_cfi $__KERNEL_DS - pushl_cfi %eax + pushl $__KERNEL_DS + pushl %eax lss (%esp), %esp /* switch to the normal stack segment */ - CFI_ADJUST_CFA_OFFSET -8 #endif .endm .macro UNWIND_ESPFIX_STACK @@ -728,13 +643,11 @@ END(sysenter_badsys) */ .align 8 ENTRY(irq_entries_start) - RING0_INT_FRAME vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ + pushl $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt - CFI_ADJUST_CFA_OFFSET -4 .align 8 .endr END(irq_entries_start) @@ -753,19 +666,16 @@ common_interrupt: call do_IRQ jmp ret_from_intr ENDPROC(common_interrupt) - CFI_ENDPROC #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ - RING0_INT_FRAME; \ ASM_CLAC; \ - pushl_cfi $~(nr); \ + pushl $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ call fn; \ jmp ret_from_intr; \ - CFI_ENDPROC; \ ENDPROC(name) @@ -784,37 +694,31 @@ ENDPROC(name) #include ENTRY(coprocessor_error) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_error + pushl $0 + pushl $do_coprocessor_error jmp error_code - CFI_ENDPROC END(coprocessor_error) ENTRY(simd_coprocessor_error) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 + pushl $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl_cfi $do_general_protection", \ + ALTERNATIVE "pushl $do_general_protection", \ "pushl $do_simd_coprocessor_error", \ X86_FEATURE_XMM #else - pushl_cfi $do_simd_coprocessor_error + pushl $do_simd_coprocessor_error #endif jmp error_code - CFI_ENDPROC END(simd_coprocessor_error) ENTRY(device_not_available) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $-1 # mark this as an int - pushl_cfi $do_device_not_available + pushl $-1 # mark this as an int + pushl $do_device_not_available jmp error_code - CFI_ENDPROC END(device_not_available) #ifdef CONFIG_PARAVIRT @@ -830,115 +734,89 @@ END(native_irq_enable_sysexit) #endif ENTRY(overflow) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_overflow + pushl $0 + pushl $do_overflow jmp error_code - CFI_ENDPROC END(overflow) ENTRY(bounds) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_bounds + pushl $0 + pushl $do_bounds jmp error_code - CFI_ENDPROC END(bounds) ENTRY(invalid_op) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_invalid_op + pushl $0 + pushl $do_invalid_op jmp error_code - CFI_ENDPROC END(invalid_op) ENTRY(coprocessor_segment_overrun) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_segment_overrun + pushl $0 + pushl $do_coprocessor_segment_overrun jmp error_code - CFI_ENDPROC END(coprocessor_segment_overrun) ENTRY(invalid_TSS) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_invalid_TSS + pushl $do_invalid_TSS jmp error_code - CFI_ENDPROC END(invalid_TSS) ENTRY(segment_not_present) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_segment_not_present + pushl $do_segment_not_present jmp error_code - CFI_ENDPROC END(segment_not_present) ENTRY(stack_segment) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_stack_segment + pushl $do_stack_segment jmp error_code - CFI_ENDPROC END(stack_segment) ENTRY(alignment_check) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_alignment_check + pushl $do_alignment_check jmp error_code - CFI_ENDPROC END(alignment_check) ENTRY(divide_error) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 # no error code - pushl_cfi $do_divide_error + pushl $0 # no error code + pushl $do_divide_error jmp error_code - CFI_ENDPROC END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi machine_check_vector + pushl $0 + pushl machine_check_vector jmp error_code - CFI_ENDPROC END(machine_check) #endif ENTRY(spurious_interrupt_bug) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_spurious_interrupt_bug + pushl $0 + pushl $do_spurious_interrupt_bug jmp error_code - CFI_ENDPROC END(spurious_interrupt_bug) #ifdef CONFIG_XEN /* Xen doesn't set %esp to be precisely what the normal sysenter entrypoint expects, so fix it up before using the normal path. */ ENTRY(xen_sysenter_target) - RING0_INT_FRAME addl $5*4, %esp /* remove xen-provided frame */ - CFI_ADJUST_CFA_OFFSET -5*4 jmp sysenter_past_esp - CFI_ENDPROC ENTRY(xen_hypervisor_callback) - CFI_STARTPROC - pushl_cfi $-1 /* orig_ax = -1 => not a system call */ + pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL TRACE_IRQS_OFF @@ -962,7 +840,6 @@ ENTRY(xen_do_upcall) call xen_maybe_preempt_hcall #endif jmp ret_from_intr - CFI_ENDPROC ENDPROC(xen_hypervisor_callback) # Hypervisor uses this for application faults while it executes. @@ -976,8 +853,7 @@ ENDPROC(xen_hypervisor_callback) # to pop the stack frame we end up in an infinite loop of failsafe callbacks. # We distinguish between categories by maintaining a status value in EAX. ENTRY(xen_failsafe_callback) - CFI_STARTPROC - pushl_cfi %eax + pushl %eax movl $1,%eax 1: mov 4(%esp),%ds 2: mov 8(%esp),%es @@ -986,15 +862,13 @@ ENTRY(xen_failsafe_callback) /* EAX == 0 => Category 1 (Bad segment) EAX != 0 => Category 2 (Bad IRET) */ testl %eax,%eax - popl_cfi %eax + popl %eax lea 16(%esp),%esp - CFI_ADJUST_CFA_OFFSET -16 jz 5f jmp iret_exc -5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ +5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL jmp ret_from_exception - CFI_ENDPROC .section .fixup,"ax" 6: xorl %eax,%eax @@ -1195,34 +1069,28 @@ return_to_handler: #ifdef CONFIG_TRACING ENTRY(trace_page_fault) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $trace_do_page_fault + pushl $trace_do_page_fault jmp error_code - CFI_ENDPROC END(trace_page_fault) #endif ENTRY(page_fault) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_page_fault + pushl $do_page_fault ALIGN error_code: /* the function address is in %gs's slot on the stack */ - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0*/ - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx cld movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs @@ -1240,7 +1108,6 @@ error_code: movl %esp,%eax # pt_regs pointer call *%edi jmp ret_from_exception - CFI_ENDPROC END(page_fault) /* @@ -1261,29 +1128,24 @@ END(page_fault) jne \ok \label: movl TSS_sysenter_sp0 + \offset(%esp), %esp - CFI_DEF_CFA esp, 0 - CFI_UNDEFINED eip - pushfl_cfi - pushl_cfi $__KERNEL_CS - pushl_cfi $sysenter_past_esp - CFI_REL_OFFSET eip, 0 + pushfl + pushl $__KERNEL_CS + pushl $sysenter_past_esp .endm ENTRY(debug) - RING0_INT_FRAME ASM_CLAC cmpl $ia32_sysenter_target,(%esp) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: - pushl_cfi $-1 # mark this as an int + pushl $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug jmp ret_from_exception - CFI_ENDPROC END(debug) /* @@ -1295,45 +1157,40 @@ END(debug) * fault happened on the sysenter path. */ ENTRY(nmi) - RING0_INT_FRAME ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 - pushl_cfi %eax + pushl %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax - popl_cfi %eax + popl %eax je nmi_espfix_stack #endif cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup - pushl_cfi %eax + pushl %eax movl %esp,%eax /* Do not access memory above the end of our stack page, * it might not exist. */ andl $(THREAD_SIZE-1),%eax cmpl $(THREAD_SIZE-20),%eax - popl_cfi %eax + popl %eax jae nmi_stack_correct cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl_cfi %eax + pushl %eax SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi jmp restore_all_notrace - CFI_ENDPROC nmi_stack_fixup: - RING0_INT_FRAME FIX_STACK 12, nmi_stack_correct, 1 jmp nmi_stack_correct nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ cmpw $__KERNEL_CS,16(%esp) jne nmi_stack_correct cmpl $debug,(%esp) @@ -1345,57 +1202,48 @@ nmi_debug_stack_check: #ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. - * + /* * create the pointer to lss back */ - pushl_cfi %ss - pushl_cfi %esp + pushl %ss + pushl %esp addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 - pushl_cfi 16(%esp) + pushl 16(%esp) .endr - pushl_cfi %eax + pushl %eax SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 jmp irq_return #endif - CFI_ENDPROC END(nmi) ENTRY(int3) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $-1 # mark this as an int + pushl $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 jmp ret_from_exception - CFI_ENDPROC END(int3) ENTRY(general_protection) - RING0_EC_FRAME - pushl_cfi $do_general_protection + pushl $do_general_protection jmp error_code - CFI_ENDPROC END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_async_page_fault + pushl $do_async_page_fault jmp error_code - CFI_ENDPROC END(async_page_fault) #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 47b95813dc37..b84cec50c8cf 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -19,8 +19,6 @@ * at the top of the kernel process stack. * * Some macro usage: - * - CFI macros are used to generate dwarf2 unwind information for better - * backtraces. They don't change any code. * - ENTRY/END Define functions in the symbol table. * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. * - idtentry - Define exception entry points. @@ -30,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -112,61 +109,6 @@ ENDPROC(native_usergs_sysret64) # define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ #endif -/* - * empty frame - */ - .macro EMPTY_FRAME start=1 offset=0 - .if \start - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,8+\offset - .else - CFI_DEF_CFA_OFFSET 8+\offset - .endif - .endm - -/* - * initial frame state for interrupts (and exceptions without error code) - */ - .macro INTR_FRAME start=1 offset=0 - EMPTY_FRAME \start, 5*8+\offset - /*CFI_REL_OFFSET ss, 4*8+\offset*/ - CFI_REL_OFFSET rsp, 3*8+\offset - /*CFI_REL_OFFSET rflags, 2*8+\offset*/ - /*CFI_REL_OFFSET cs, 1*8+\offset*/ - CFI_REL_OFFSET rip, 0*8+\offset - .endm - -/* - * initial frame state for exceptions with error code (and interrupts - * with vector already pushed) - */ - .macro XCPT_FRAME start=1 offset=0 - INTR_FRAME \start, 1*8+\offset - .endm - -/* - * frame that enables passing a complete pt_regs to a C function. - */ - .macro DEFAULT_FRAME start=1 offset=0 - XCPT_FRAME \start, ORIG_RAX+\offset - CFI_REL_OFFSET rdi, RDI+\offset - CFI_REL_OFFSET rsi, RSI+\offset - CFI_REL_OFFSET rdx, RDX+\offset - CFI_REL_OFFSET rcx, RCX+\offset - CFI_REL_OFFSET rax, RAX+\offset - CFI_REL_OFFSET r8, R8+\offset - CFI_REL_OFFSET r9, R9+\offset - CFI_REL_OFFSET r10, R10+\offset - CFI_REL_OFFSET r11, R11+\offset - CFI_REL_OFFSET rbx, RBX+\offset - CFI_REL_OFFSET rbp, RBP+\offset - CFI_REL_OFFSET r12, R12+\offset - CFI_REL_OFFSET r13, R13+\offset - CFI_REL_OFFSET r14, R14+\offset - CFI_REL_OFFSET r15, R15+\offset - .endm - /* * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. * @@ -196,12 +138,6 @@ ENDPROC(native_usergs_sysret64) */ ENTRY(system_call) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -219,8 +155,8 @@ GLOBAL(system_call_after_swapgs) movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp /* Construct struct pt_regs on stack */ - pushq_cfi $__USER_DS /* pt_regs->ss */ - pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ /* * Re-enable interrupts. * We use 'rsp_scratch' as a scratch space, hence irq-off block above @@ -229,22 +165,20 @@ GLOBAL(system_call_after_swapgs) * with using rsp_scratch: */ ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %r11 /* pt_regs->flags */ - pushq_cfi $__USER_CS /* pt_regs->cs */ - pushq_cfi %rcx /* pt_regs->ip */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ - pushq_cfi_reg r8 /* pt_regs->r8 */ - pushq_cfi_reg r9 /* pt_regs->r9 */ - pushq_cfi_reg r10 /* pt_regs->r10 */ - pushq_cfi_reg r11 /* pt_regs->r11 */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 6*8 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz tracesys @@ -282,13 +216,9 @@ system_call_fastpath: testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ - CFI_REMEMBER_STATE - RESTORE_C_REGS_EXCEPT_RCX_R11 movq RIP(%rsp),%rcx - CFI_REGISTER rip,rcx movq EFLAGS(%rsp),%r11 - /*CFI_REGISTER rflags,r11*/ movq RSP(%rsp),%rsp /* * 64bit SYSRET restores rip from rcx, @@ -307,8 +237,6 @@ system_call_fastpath: */ USERGS_SYSRET64 - CFI_RESTORE_STATE - /* Do syscall entry tracing */ tracesys: movq %rsp, %rdi @@ -374,9 +302,9 @@ int_careful: jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi + pushq %rdi SCHEDULE_USER - popq_cfi %rdi + popq %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -389,10 +317,10 @@ int_very_careful: /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal - pushq_cfi %rdi + pushq %rdi leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave - popq_cfi %rdi + popq %rdi andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest @@ -475,27 +403,21 @@ syscall_return: * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: - CFI_REMEMBER_STATE /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp),%rsp USERGS_SYSRET64 - CFI_RESTORE_STATE opportunistic_sysret_failed: SWAPGS jmp restore_c_regs_and_iret - CFI_ENDPROC END(system_call) .macro FORK_LIKE func ENTRY(stub_\func) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* offset 8: return address */ SAVE_EXTRA_REGS 8 jmp sys_\func - CFI_ENDPROC END(stub_\func) .endm @@ -504,8 +426,6 @@ END(stub_\func) FORK_LIKE vfork ENTRY(stub_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execve return_from_execve: testl %eax, %eax @@ -515,11 +435,9 @@ return_from_execve: 1: /* must use IRET code path (pt_regs->cs may have changed) */ addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 ZERO_EXTRA_REGS movq %rax,RAX(%rsp) jmp int_ret_from_sys_call - CFI_ENDPROC END(stub_execve) /* * Remaining execve stubs are only 7 bytes long. @@ -527,32 +445,23 @@ END(stub_execve) */ .align 8 GLOBAL(stub_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub_execveat) #if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) .align 8 GLOBAL(stub_x32_execve) GLOBAL(stub32_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call compat_sys_execve jmp return_from_execve - CFI_ENDPROC END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) GLOBAL(stub32_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call compat_sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub32_execveat) END(stub_x32_execveat) #endif @@ -562,8 +471,6 @@ END(stub_x32_execveat) * This cannot be done with SYSRET, so use the IRET return path instead. */ ENTRY(stub_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* * SAVE_EXTRA_REGS result is not normally needed: * sigreturn overwrites all pt_regs->GPREGS. @@ -575,21 +482,16 @@ ENTRY(stub_rt_sigreturn) call sys_rt_sigreturn return_from_stub: addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 RESTORE_EXTRA_REGS movq %rax,RAX(%rsp) jmp int_ret_from_sys_call - CFI_ENDPROC END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI ENTRY(stub_x32_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 SAVE_EXTRA_REGS 8 call sys32_x32_rt_sigreturn jmp return_from_stub - CFI_ENDPROC END(stub_x32_rt_sigreturn) #endif @@ -599,12 +501,11 @@ END(stub_x32_rt_sigreturn) * rdi: prev task we switched from */ ENTRY(ret_from_fork) - DEFAULT_FRAME LOCK ; btr $TIF_FORK,TI_flags(%r8) - pushq_cfi $0x0002 - popfq_cfi # reset kernel eflags + pushq $0x0002 + popfq # reset kernel eflags call schedule_tail # rdi: 'prev' task parameter @@ -628,7 +529,6 @@ ENTRY(ret_from_fork) movl $0, RAX(%rsp) RESTORE_EXTRA_REGS jmp int_ret_from_sys_call - CFI_ENDPROC END(ret_from_fork) /* @@ -637,16 +537,13 @@ END(ret_from_fork) */ .align 8 ENTRY(irq_entries_start) - INTR_FRAME vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ + pushq $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt - CFI_ADJUST_CFA_OFFSET -8 .align 8 .endr - CFI_ENDPROC END(irq_entries_start) /* @@ -688,17 +585,7 @@ END(irq_entries_start) movq %rsp, %rsi incl PER_CPU_VAR(irq_count) cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - CFI_DEF_CFA_REGISTER rsi pushq %rsi - /* - * For debugger: - * "CFA (Current Frame Address) is the value on stack + offset" - */ - CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ - 0x77 /* DW_OP_breg7 (rsp) */, 0, \ - 0x06 /* DW_OP_deref */, \ - 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ - 0x22 /* DW_OP_plus */ /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -711,7 +598,6 @@ END(irq_entries_start) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - XCPT_FRAME ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ @@ -723,11 +609,8 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi - CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ /* return code expects complete pt_regs - adjust rsp accordingly: */ leaq -RBP(%rsi),%rsp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET RBP testb $3, CS(%rsp) jz retint_kernel @@ -743,7 +626,6 @@ retint_check: LOCKDEP_SYS_EXIT_IRQ movl TI_flags(%rcx),%edx andl %edi,%edx - CFI_REMEMBER_STATE jnz retint_careful retint_swapgs: /* return to user-space */ @@ -807,8 +689,8 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: - pushq_cfi %rax - pushq_cfi %rdi + pushq %rax + pushq %rdi SWAPGS movq PER_CPU_VAR(espfix_waddr),%rdi movq %rax,(0*8)(%rdi) /* RAX */ @@ -823,24 +705,23 @@ native_irq_return_ldt: movq (5*8)(%rsp),%rax /* RSP */ movq %rax,(4*8)(%rdi) andl $0xffff0000,%eax - popq_cfi %rdi + popq %rdi orq PER_CPU_VAR(espfix_stack),%rax SWAPGS movq %rax,%rsp - popq_cfi %rax + popq %rax jmp native_irq_return_iret #endif /* edi: workmask, edx: work */ retint_careful: - CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi + pushq %rdi SCHEDULE_USER - popq_cfi %rdi + popq %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -862,7 +743,6 @@ retint_signal: GET_THREAD_INFO(%rcx) jmp retint_with_reschedule - CFI_ENDPROC END(common_interrupt) /* @@ -870,13 +750,11 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) - INTR_FRAME ASM_CLAC - pushq_cfi $~(\num) + pushq $~(\num) .Lcommon_\sym: interrupt \do_sym jmp ret_from_intr - CFI_ENDPROC END(\sym) .endm @@ -959,24 +837,17 @@ ENTRY(\sym) .error "using shift_ist requires paranoid=1" .endif - .if \has_error_code - XCPT_FRAME - .else - INTR_FRAME - .endif - ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 - CFI_REMEMBER_STATE testb $3, CS(%rsp) /* If coming from userspace, switch */ jnz 1f /* stacks. */ .endif @@ -986,8 +857,6 @@ ENTRY(\sym) .endif /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - DEFAULT_FRAME 0 - .if \paranoid .if \shift_ist != -1 TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ @@ -1023,7 +892,6 @@ ENTRY(\sym) .endif .if \paranoid == 1 - CFI_RESTORE_STATE /* * Paranoid entry from userspace. Switch stacks and treat it * as a normal entry. This means that paranoid handlers @@ -1032,7 +900,6 @@ ENTRY(\sym) 1: call error_entry - DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ call sync_regs @@ -1051,8 +918,6 @@ ENTRY(\sym) jmp error_exit /* %ebx: no swapgs flag */ .endif - - CFI_ENDPROC END(\sym) .endm @@ -1085,17 +950,15 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 /* Reload gs selector with exception handling */ /* edi: new selector */ ENTRY(native_load_gs_index) - CFI_STARTPROC - pushfq_cfi + pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: movl %edi,%gs 2: mfence /* workaround */ SWAPGS - popfq_cfi + popfq ret - CFI_ENDPROC END(native_load_gs_index) _ASM_EXTABLE(gs_change,bad_gs) @@ -1110,22 +973,15 @@ bad_gs: /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(do_softirq_own_stack) - CFI_STARTPROC - pushq_cfi %rbp - CFI_REL_OFFSET rbp,0 + pushq %rbp mov %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp incl PER_CPU_VAR(irq_count) cmove PER_CPU_VAR(irq_stack_ptr),%rsp push %rbp # backlink for old unwinder call __do_softirq leaveq - CFI_RESTORE rbp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 decl PER_CPU_VAR(irq_count) ret - CFI_ENDPROC END(do_softirq_own_stack) #ifdef CONFIG_XEN @@ -1145,28 +1001,22 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 * activation and restart the handler using the previous one. */ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) - CFI_STARTPROC /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ movq %rdi, %rsp # we don't return, adjust the stack frame - CFI_ENDPROC - DEFAULT_FRAME 11: incl PER_CPU_VAR(irq_count) movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp pushq %rbp # backlink for old unwinder call xen_evtchn_do_upcall popq %rsp - CFI_DEF_CFA_REGISTER rsp decl PER_CPU_VAR(irq_count) #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall #endif jmp error_exit - CFI_ENDPROC END(xen_do_hypervisor_callback) /* @@ -1183,16 +1033,8 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) - INTR_FRAME 1 (6*8) - /*CFI_REL_OFFSET gs,GS*/ - /*CFI_REL_OFFSET fs,FS*/ - /*CFI_REL_OFFSET es,ES*/ - /*CFI_REL_OFFSET ds,DS*/ - CFI_REL_OFFSET r11,8 - CFI_REL_OFFSET rcx,0 movl %ds,%ecx cmpw %cx,0x10(%rsp) - CFI_REMEMBER_STATE jne 1f movl %es,%ecx cmpw %cx,0x18(%rsp) @@ -1205,29 +1047,21 @@ ENTRY(xen_failsafe_callback) jne 1f /* All segments match their saved values => Category 2 (Bad IRET). */ movq (%rsp),%rcx - CFI_RESTORE rcx movq 8(%rsp),%r11 - CFI_RESTORE r11 addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $0 /* RIP */ - pushq_cfi %r11 - pushq_cfi %rcx + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx jmp general_protection - CFI_RESTORE_STATE 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ movq (%rsp),%rcx - CFI_RESTORE rcx movq 8(%rsp),%r11 - CFI_RESTORE r11 addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $-1 /* orig_ax = -1 => not a system call */ + pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS jmp error_exit - CFI_ENDPROC END(xen_failsafe_callback) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ @@ -1263,7 +1097,6 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector( * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1275,7 +1108,6 @@ ENTRY(paranoid_entry) SWAPGS xorl %ebx,%ebx 1: ret - CFI_ENDPROC END(paranoid_entry) /* @@ -1290,7 +1122,6 @@ END(paranoid_entry) */ /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) - DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ @@ -1305,7 +1136,6 @@ paranoid_exit_restore: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN - CFI_ENDPROC END(paranoid_exit) /* @@ -1313,7 +1143,6 @@ END(paranoid_exit) * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(error_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1333,7 +1162,6 @@ error_sti: * for these here too. */ error_kernelspace: - CFI_REL_OFFSET rcx, RCX+8 incl %ebx leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) @@ -1357,13 +1185,11 @@ error_bad_iret: mov %rax,%rsp decl %ebx /* Return to usergs */ jmp error_sti - CFI_ENDPROC END(error_entry) /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(error_exit) - DEFAULT_FRAME movl %ebx,%eax RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) @@ -1377,12 +1203,10 @@ ENTRY(error_exit) andl %edi,%edx jnz retint_careful jmp retint_swapgs - CFI_ENDPROC END(error_exit) /* Runs on exception stack */ ENTRY(nmi) - INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME /* * We allow breakpoints in NMIs. If a breakpoint occurs, then @@ -1417,8 +1241,7 @@ ENTRY(nmi) */ /* Use %rdx as our temp variable throughout */ - pushq_cfi %rdx - CFI_REL_OFFSET rdx, 0 + pushq %rdx /* * If %cs was not the kernel segment, then the NMI triggered in user @@ -1452,8 +1275,6 @@ ENTRY(nmi) jb first_nmi /* Ah, it is within the NMI stack, treat it as nested */ - CFI_REMEMBER_STATE - nested_nmi: /* * Do nothing if we interrupted the fixup in repeat_nmi. @@ -1471,26 +1292,22 @@ nested_nmi: /* Set up the interrupted NMIs stack to jump to repeat_nmi */ leaq -1*8(%rsp), %rdx movq %rdx, %rsp - CFI_ADJUST_CFA_OFFSET 1*8 leaq -10*8(%rsp), %rdx - pushq_cfi $__KERNEL_DS - pushq_cfi %rdx - pushfq_cfi - pushq_cfi $__KERNEL_CS - pushq_cfi $repeat_nmi + pushq $__KERNEL_DS + pushq %rdx + pushfq + pushq $__KERNEL_CS + pushq $repeat_nmi /* Put stack back */ addq $(6*8), %rsp - CFI_ADJUST_CFA_OFFSET -6*8 nested_nmi_out: - popq_cfi %rdx - CFI_RESTORE rdx + popq %rdx /* No need to check faults here */ INTERRUPT_RETURN - CFI_RESTORE_STATE first_nmi: /* * Because nested NMIs will use the pushed location that we @@ -1529,22 +1346,19 @@ first_nmi: */ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ movq (%rsp), %rdx - CFI_RESTORE rdx /* Set the NMI executing variable on the stack. */ - pushq_cfi $1 + pushq $1 /* * Leave room for the "copied" frame */ subq $(5*8), %rsp - CFI_ADJUST_CFA_OFFSET 5*8 /* Copy the stack frame to the Saved frame */ .rept 5 - pushq_cfi 11*8(%rsp) + pushq 11*8(%rsp) .endr - CFI_DEF_CFA_OFFSET 5*8 /* Everything up to here is safe from nested NMIs */ @@ -1567,12 +1381,10 @@ repeat_nmi: /* Make another copy, this one may be modified by nested NMIs */ addq $(10*8), %rsp - CFI_ADJUST_CFA_OFFSET -10*8 .rept 5 - pushq_cfi -6*8(%rsp) + pushq -6*8(%rsp) .endr subq $(5*8), %rsp - CFI_DEF_CFA_OFFSET 5*8 end_repeat_nmi: /* @@ -1580,7 +1392,7 @@ end_repeat_nmi: * NMI if the first NMI took an exception and reset our iret stack * so that we repeat another NMI. */ - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ ALLOC_PT_GPREGS_ON_STACK /* @@ -1591,7 +1403,6 @@ end_repeat_nmi: * exceptions might do. */ call paranoid_entry - DEFAULT_FRAME 0 /* * Save off the CR2 register. If we take a page fault in the NMI then @@ -1628,13 +1439,10 @@ nmi_restore: /* Clear the NMI executing stack variable */ movq $0, 5*8(%rsp) jmp irq_return - CFI_ENDPROC END(nmi) ENTRY(ignore_sysret) - CFI_STARTPROC mov $-ENOSYS,%eax sysret - CFI_ENDPROC END(ignore_sysret) diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 00933d5e992f..9b0ca8fe80fc 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -11,26 +11,23 @@ #include #include -#include /* if you want SMP support, implement these with real spinlocks */ .macro LOCK reg - pushfl_cfi + pushfl cli .endm .macro UNLOCK reg - popfl_cfi + popfl .endm #define BEGIN(op) \ .macro endp; \ - CFI_ENDPROC; \ ENDPROC(atomic64_##op##_386); \ .purgem endp; \ .endm; \ ENTRY(atomic64_##op##_386); \ - CFI_STARTPROC; \ LOCK v; #define ENDP endp diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 082a85167a5b..db3ae85440ff 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -11,7 +11,6 @@ #include #include -#include .macro read64 reg movl %ebx, %eax @@ -22,16 +21,11 @@ .endm ENTRY(atomic64_read_cx8) - CFI_STARTPROC - read64 %ecx ret - CFI_ENDPROC ENDPROC(atomic64_read_cx8) ENTRY(atomic64_set_cx8) - CFI_STARTPROC - 1: /* we don't need LOCK_PREFIX since aligned 64-bit writes * are atomic on 586 and newer */ @@ -39,28 +33,23 @@ ENTRY(atomic64_set_cx8) jne 1b ret - CFI_ENDPROC ENDPROC(atomic64_set_cx8) ENTRY(atomic64_xchg_cx8) - CFI_STARTPROC - 1: LOCK_PREFIX cmpxchg8b (%esi) jne 1b ret - CFI_ENDPROC ENDPROC(atomic64_xchg_cx8) .macro addsub_return func ins insc ENTRY(atomic64_\func\()_return_cx8) - CFI_STARTPROC - pushl_cfi_reg ebp - pushl_cfi_reg ebx - pushl_cfi_reg esi - pushl_cfi_reg edi + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi movl %eax, %esi movl %edx, %edi @@ -79,12 +68,11 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg edi - popl_cfi_reg esi - popl_cfi_reg ebx - popl_cfi_reg ebp + popl %edi + popl %esi + popl %ebx + popl %ebp ret - CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) .endm @@ -93,8 +81,7 @@ addsub_return sub sub sbb .macro incdec_return func ins insc ENTRY(atomic64_\func\()_return_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -109,9 +96,8 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) .endm @@ -119,8 +105,7 @@ incdec_return inc add adc incdec_return dec sub sbb ENTRY(atomic64_dec_if_positive_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -136,18 +121,16 @@ ENTRY(atomic64_dec_if_positive_cx8) 2: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_dec_if_positive_cx8) ENTRY(atomic64_add_unless_cx8) - CFI_STARTPROC - pushl_cfi_reg ebp - pushl_cfi_reg ebx + pushl %ebp + pushl %ebx /* these just push these two parameters on the stack */ - pushl_cfi_reg edi - pushl_cfi_reg ecx + pushl %edi + pushl %ecx movl %eax, %ebp movl %edx, %edi @@ -168,21 +151,18 @@ ENTRY(atomic64_add_unless_cx8) movl $1, %eax 3: addl $8, %esp - CFI_ADJUST_CFA_OFFSET -8 - popl_cfi_reg ebx - popl_cfi_reg ebp + popl %ebx + popl %ebp ret 4: cmpl %edx, 4(%esp) jne 2b xorl %eax, %eax jmp 3b - CFI_ENDPROC ENDPROC(atomic64_add_unless_cx8) ENTRY(atomic64_inc_not_zero_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -199,7 +179,6 @@ ENTRY(atomic64_inc_not_zero_cx8) movl $1, %eax 3: - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_inc_not_zero_cx8) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 9bc944a91274..c1e623209853 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -26,7 +26,6 @@ */ #include -#include #include #include @@ -50,9 +49,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) * alignment for the unrolled loop. */ ENTRY(csum_partial) - CFI_STARTPROC - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %esi + pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: unsigned char *buff @@ -129,10 +127,9 @@ ENTRY(csum_partial) jz 8f roll $8, %eax 8: - popl_cfi_reg ebx - popl_cfi_reg esi + popl %ebx + popl %esi ret - CFI_ENDPROC ENDPROC(csum_partial) #else @@ -140,9 +137,8 @@ ENDPROC(csum_partial) /* Version for PentiumII/PPro */ ENTRY(csum_partial) - CFI_STARTPROC - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %esi + pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: const unsigned char *buf @@ -249,10 +245,9 @@ ENTRY(csum_partial) jz 90f roll $8, %eax 90: - popl_cfi_reg ebx - popl_cfi_reg esi + popl %ebx + popl %esi ret - CFI_ENDPROC ENDPROC(csum_partial) #endif @@ -287,12 +282,10 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, #define FP 12 ENTRY(csum_partial_copy_generic) - CFI_STARTPROC subl $4,%esp - CFI_ADJUST_CFA_OFFSET 4 - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %edi + pushl %esi + pushl %ebx movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len movl ARGBASE+4(%esp),%esi # src @@ -401,12 +394,11 @@ DST( movb %cl, (%edi) ) .previous - popl_cfi_reg ebx - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi %ecx # equivalent to addl $4,%esp + popl %ebx + popl %esi + popl %edi + popl %ecx # equivalent to addl $4,%esp ret - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) #else @@ -426,10 +418,9 @@ ENDPROC(csum_partial_copy_generic) #define ARGBASE 12 ENTRY(csum_partial_copy_generic) - CFI_STARTPROC - pushl_cfi_reg ebx - pushl_cfi_reg edi - pushl_cfi_reg esi + pushl %ebx + pushl %edi + pushl %esi movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst movl ARGBASE+12(%esp),%ecx #len @@ -489,11 +480,10 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi_reg ebx + popl %esi + popl %edi + popl %ebx ret - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) #undef ROUND diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index e67e579c93bd..a2fe51b00cce 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,4 @@ #include -#include #include #include @@ -15,7 +14,6 @@ * %rdi - page */ ENTRY(clear_page) - CFI_STARTPROC ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ "jmp clear_page_c_e", X86_FEATURE_ERMS @@ -24,11 +22,9 @@ ENTRY(clear_page) xorl %eax,%eax rep stosq ret - CFI_ENDPROC ENDPROC(clear_page) ENTRY(clear_page_orig) - CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx @@ -48,14 +44,11 @@ ENTRY(clear_page_orig) jnz .Lloop nop ret - CFI_ENDPROC ENDPROC(clear_page_orig) ENTRY(clear_page_c_e) - CFI_STARTPROC movl $4096,%ecx xorl %eax,%eax rep stosb ret - CFI_ENDPROC ENDPROC(clear_page_c_e) diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S index 40a172541ee2..9b330242e740 100644 --- a/arch/x86/lib/cmpxchg16b_emu.S +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -6,7 +6,6 @@ * */ #include -#include #include .text @@ -21,7 +20,6 @@ * %al : Operation successful */ ENTRY(this_cpu_cmpxchg16b_emu) -CFI_STARTPROC # # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not @@ -32,7 +30,7 @@ CFI_STARTPROC # *atomic* on a single cpu (as provided by the this_cpu_xx class of # macros). # - pushfq_cfi + pushfq cli cmpq PER_CPU_VAR((%rsi)), %rax @@ -43,17 +41,13 @@ CFI_STARTPROC movq %rbx, PER_CPU_VAR((%rsi)) movq %rcx, PER_CPU_VAR(8(%rsi)) - CFI_REMEMBER_STATE - popfq_cfi + popfq mov $1, %al ret - CFI_RESTORE_STATE .Lnot_same: - popfq_cfi + popfq xor %al,%al ret -CFI_ENDPROC - ENDPROC(this_cpu_cmpxchg16b_emu) diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index b4807fce5177..ad5349778490 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -7,7 +7,6 @@ */ #include -#include .text @@ -20,14 +19,13 @@ * %ecx : high 32 bits of new value */ ENTRY(cmpxchg8b_emu) -CFI_STARTPROC # # Emulate 'cmpxchg8b (%esi)' on UP except we don't # set the whole ZF thing (caller will just compare # eax:edx with the expected value) # - pushfl_cfi + pushfl cli cmpl (%esi), %eax @@ -38,18 +36,15 @@ CFI_STARTPROC movl %ebx, (%esi) movl %ecx, 4(%esi) - CFI_REMEMBER_STATE - popfl_cfi + popfl ret - CFI_RESTORE_STATE .Lnot_same: movl (%esi), %eax .Lhalf_same: movl 4(%esi), %edx - popfl_cfi + popfl ret -CFI_ENDPROC ENDPROC(cmpxchg8b_emu) diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 8239dbcbf984..009f98216b7e 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -1,7 +1,6 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ #include -#include #include #include @@ -13,22 +12,16 @@ */ ALIGN ENTRY(copy_page) - CFI_STARTPROC ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret - CFI_ENDPROC ENDPROC(copy_page) ENTRY(copy_page_regs) - CFI_STARTPROC subq $2*8, %rsp - CFI_ADJUST_CFA_OFFSET 2*8 movq %rbx, (%rsp) - CFI_REL_OFFSET rbx, 0 movq %r12, 1*8(%rsp) - CFI_REL_OFFSET r12, 1*8 movl $(4096/64)-5, %ecx .p2align 4 @@ -87,11 +80,7 @@ ENTRY(copy_page_regs) jnz .Loop2 movq (%rsp), %rbx - CFI_RESTORE rbx movq 1*8(%rsp), %r12 - CFI_RESTORE r12 addq $2*8, %rsp - CFI_ADJUST_CFA_OFFSET -2*8 ret - CFI_ENDPROC ENDPROC(copy_page_regs) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index e4b3beee83bd..982ce34f4a9b 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -7,7 +7,6 @@ */ #include -#include #include #include #include @@ -18,7 +17,6 @@ /* Standard copy_to_user with segment limit checking */ ENTRY(_copy_to_user) - CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx addq %rdx,%rcx @@ -30,12 +28,10 @@ ENTRY(_copy_to_user) X86_FEATURE_REP_GOOD, \ "jmp copy_user_enhanced_fast_string", \ X86_FEATURE_ERMS - CFI_ENDPROC ENDPROC(_copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(_copy_from_user) - CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx addq %rdx,%rcx @@ -47,14 +43,12 @@ ENTRY(_copy_from_user) X86_FEATURE_REP_GOOD, \ "jmp copy_user_enhanced_fast_string", \ X86_FEATURE_ERMS - CFI_ENDPROC ENDPROC(_copy_from_user) .section .fixup,"ax" /* must zero dest */ ENTRY(bad_from_user) bad_from_user: - CFI_STARTPROC movl %edx,%ecx xorl %eax,%eax rep @@ -62,7 +56,6 @@ bad_from_user: bad_to_user: movl %edx,%eax ret - CFI_ENDPROC ENDPROC(bad_from_user) .previous @@ -80,7 +73,6 @@ ENDPROC(bad_from_user) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_unrolled) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ @@ -162,7 +154,6 @@ ENTRY(copy_user_generic_unrolled) _ASM_EXTABLE(19b,40b) _ASM_EXTABLE(21b,50b) _ASM_EXTABLE(22b,50b) - CFI_ENDPROC ENDPROC(copy_user_generic_unrolled) /* Some CPUs run faster using the string copy instructions. @@ -184,7 +175,6 @@ ENDPROC(copy_user_generic_unrolled) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_string) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 2f /* less than 8 bytes, go to byte copy loop */ @@ -209,7 +199,6 @@ ENTRY(copy_user_generic_string) _ASM_EXTABLE(1b,11b) _ASM_EXTABLE(3b,12b) - CFI_ENDPROC ENDPROC(copy_user_generic_string) /* @@ -225,7 +214,6 @@ ENDPROC(copy_user_generic_string) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_enhanced_fast_string) - CFI_STARTPROC ASM_STAC movl %edx,%ecx 1: rep @@ -240,7 +228,6 @@ ENTRY(copy_user_enhanced_fast_string) .previous _ASM_EXTABLE(1b,12b) - CFI_ENDPROC ENDPROC(copy_user_enhanced_fast_string) /* @@ -248,7 +235,6 @@ ENDPROC(copy_user_enhanced_fast_string) * This will force destination/source out of cache for more performance. */ ENTRY(__copy_user_nocache) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ @@ -332,5 +318,4 @@ ENTRY(__copy_user_nocache) _ASM_EXTABLE(19b,40b) _ASM_EXTABLE(21b,50b) _ASM_EXTABLE(22b,50b) - CFI_ENDPROC ENDPROC(__copy_user_nocache) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 9734182966f3..7e48807b2fa1 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -6,7 +6,6 @@ * for more details. No warranty for anything given at all. */ #include -#include #include #include @@ -47,23 +46,16 @@ ENTRY(csum_partial_copy_generic) - CFI_STARTPROC cmpl $3*64, %edx jle .Lignore .Lignore: subq $7*8, %rsp - CFI_ADJUST_CFA_OFFSET 7*8 movq %rbx, 2*8(%rsp) - CFI_REL_OFFSET rbx, 2*8 movq %r12, 3*8(%rsp) - CFI_REL_OFFSET r12, 3*8 movq %r14, 4*8(%rsp) - CFI_REL_OFFSET r14, 4*8 movq %r13, 5*8(%rsp) - CFI_REL_OFFSET r13, 5*8 movq %rbp, 6*8(%rsp) - CFI_REL_OFFSET rbp, 6*8 movq %r8, (%rsp) movq %r9, 1*8(%rsp) @@ -206,22 +198,14 @@ ENTRY(csum_partial_copy_generic) addl %ebx, %eax adcl %r9d, %eax /* carry */ - CFI_REMEMBER_STATE .Lende: movq 2*8(%rsp), %rbx - CFI_RESTORE rbx movq 3*8(%rsp), %r12 - CFI_RESTORE r12 movq 4*8(%rsp), %r14 - CFI_RESTORE r14 movq 5*8(%rsp), %r13 - CFI_RESTORE r13 movq 6*8(%rsp), %rbp - CFI_RESTORE rbp addq $7*8, %rsp - CFI_ADJUST_CFA_OFFSET -7*8 ret - CFI_RESTORE_STATE /* Exception handlers. Very simple, zeroing is done in the wrappers */ .Lbad_source: @@ -237,5 +221,4 @@ ENTRY(csum_partial_copy_generic) jz .Lende movl $-EFAULT, (%rax) jmp .Lende - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index a4512359656a..46668cda4ffd 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -26,7 +26,6 @@ */ #include -#include #include #include #include @@ -36,7 +35,6 @@ .text ENTRY(__get_user_1) - CFI_STARTPROC GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user @@ -45,11 +43,9 @@ ENTRY(__get_user_1) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_1) ENTRY(__get_user_2) - CFI_STARTPROC add $1,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) @@ -60,11 +56,9 @@ ENTRY(__get_user_2) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_2) ENTRY(__get_user_4) - CFI_STARTPROC add $3,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) @@ -75,11 +69,9 @@ ENTRY(__get_user_4) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_4) ENTRY(__get_user_8) - CFI_STARTPROC #ifdef CONFIG_X86_64 add $7,%_ASM_AX jc bad_get_user @@ -104,28 +96,23 @@ ENTRY(__get_user_8) ASM_CLAC ret #endif - CFI_ENDPROC ENDPROC(__get_user_8) bad_get_user: - CFI_STARTPROC xor %edx,%edx mov $(-EFAULT),%_ASM_AX ASM_CLAC ret - CFI_ENDPROC END(bad_get_user) #ifdef CONFIG_X86_32 bad_get_user_8: - CFI_STARTPROC xor %edx,%edx xor %ecx,%ecx mov $(-EFAULT),%_ASM_AX ASM_CLAC ret - CFI_ENDPROC END(bad_get_user_8) #endif diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S index 05a95e713da8..33147fef3452 100644 --- a/arch/x86/lib/iomap_copy_64.S +++ b/arch/x86/lib/iomap_copy_64.S @@ -16,15 +16,12 @@ */ #include -#include /* * override generic version in lib/iomap_copy.c */ ENTRY(__iowrite32_copy) - CFI_STARTPROC movl %edx,%ecx rep movsd ret - CFI_ENDPROC ENDPROC(__iowrite32_copy) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index b046664f5a1c..16698bba87de 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -2,7 +2,6 @@ #include #include -#include #include /* @@ -53,7 +52,6 @@ ENTRY(memcpy_erms) ENDPROC(memcpy_erms) ENTRY(memcpy_orig) - CFI_STARTPROC movq %rdi, %rax cmpq $0x20, %rdx @@ -178,5 +176,4 @@ ENTRY(memcpy_orig) .Lend: retq - CFI_ENDPROC ENDPROC(memcpy_orig) diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 0f8a0d0331b9..ca2afdd6d98e 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -6,7 +6,6 @@ * - Copyright 2011 Fenghua Yu */ #include -#include #include #include @@ -27,7 +26,6 @@ ENTRY(memmove) ENTRY(__memmove) - CFI_STARTPROC /* Handle more 32 bytes in loop */ mov %rdi, %rax @@ -207,6 +205,5 @@ ENTRY(__memmove) movb %r11b, (%rdi) 13: retq - CFI_ENDPROC ENDPROC(__memmove) ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 93118fb23976..2661fad05827 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -1,7 +1,6 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ #include -#include #include #include @@ -66,7 +65,6 @@ ENTRY(memset_erms) ENDPROC(memset_erms) ENTRY(memset_orig) - CFI_STARTPROC movq %rdi,%r10 /* expand byte value */ @@ -78,7 +76,6 @@ ENTRY(memset_orig) movl %edi,%r9d andl $7,%r9d jnz .Lbad_alignment - CFI_REMEMBER_STATE .Lafter_bad_alignment: movq %rdx,%rcx @@ -128,7 +125,6 @@ ENTRY(memset_orig) movq %r10,%rax ret - CFI_RESTORE_STATE .Lbad_alignment: cmpq $7,%rdx jbe .Lhandle_7 @@ -139,5 +135,4 @@ ENTRY(memset_orig) subq %r8,%rdx jmp .Lafter_bad_alignment .Lfinal: - CFI_ENDPROC ENDPROC(memset_orig) diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index 3ca5218fbece..c81556409bbb 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -1,6 +1,5 @@ #include #include -#include #include #include @@ -13,9 +12,8 @@ */ .macro op_safe_regs op ENTRY(\op\()_safe_regs) - CFI_STARTPROC - pushq_cfi_reg rbx - pushq_cfi_reg rbp + pushq %rbx + pushq %rbp movq %rdi, %r10 /* Save pointer */ xorl %r11d, %r11d /* Return value */ movl (%rdi), %eax @@ -25,7 +23,6 @@ ENTRY(\op\()_safe_regs) movl 20(%rdi), %ebp movl 24(%rdi), %esi movl 28(%rdi), %edi - CFI_REMEMBER_STATE 1: \op 2: movl %eax, (%r10) movl %r11d, %eax /* Return value */ @@ -35,16 +32,14 @@ ENTRY(\op\()_safe_regs) movl %ebp, 20(%r10) movl %esi, 24(%r10) movl %edi, 28(%r10) - popq_cfi_reg rbp - popq_cfi_reg rbx + popq %rbp + popq %rbx ret 3: - CFI_RESTORE_STATE movl $-EIO, %r11d jmp 2b _ASM_EXTABLE(1b, 3b) - CFI_ENDPROC ENDPROC(\op\()_safe_regs) .endm @@ -52,13 +47,12 @@ ENDPROC(\op\()_safe_regs) .macro op_safe_regs op ENTRY(\op\()_safe_regs) - CFI_STARTPROC - pushl_cfi_reg ebx - pushl_cfi_reg ebp - pushl_cfi_reg esi - pushl_cfi_reg edi - pushl_cfi $0 /* Return value */ - pushl_cfi %eax + pushl %ebx + pushl %ebp + pushl %esi + pushl %edi + pushl $0 /* Return value */ + pushl %eax movl 4(%eax), %ecx movl 8(%eax), %edx movl 12(%eax), %ebx @@ -66,32 +60,28 @@ ENTRY(\op\()_safe_regs) movl 24(%eax), %esi movl 28(%eax), %edi movl (%eax), %eax - CFI_REMEMBER_STATE 1: \op -2: pushl_cfi %eax +2: pushl %eax movl 4(%esp), %eax - popl_cfi (%eax) + popl (%eax) addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 movl %ecx, 4(%eax) movl %edx, 8(%eax) movl %ebx, 12(%eax) movl %ebp, 20(%eax) movl %esi, 24(%eax) movl %edi, 28(%eax) - popl_cfi %eax - popl_cfi_reg edi - popl_cfi_reg esi - popl_cfi_reg ebp - popl_cfi_reg ebx + popl %eax + popl %edi + popl %esi + popl %ebp + popl %ebx ret 3: - CFI_RESTORE_STATE movl $-EIO, 4(%esp) jmp 2b _ASM_EXTABLE(1b, 3b) - CFI_ENDPROC ENDPROC(\op\()_safe_regs) .endm diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index fc6ba17a7eec..e0817a12d323 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -11,7 +11,6 @@ * return value. */ #include -#include #include #include #include @@ -30,11 +29,9 @@ * as they get called from within inline assembly. */ -#define ENTER CFI_STARTPROC ; \ - GET_THREAD_INFO(%_ASM_BX) +#define ENTER GET_THREAD_INFO(%_ASM_BX) #define EXIT ASM_CLAC ; \ - ret ; \ - CFI_ENDPROC + ret .text ENTRY(__put_user_1) @@ -87,7 +84,6 @@ ENTRY(__put_user_8) ENDPROC(__put_user_8) bad_put_user: - CFI_STARTPROC movl $-EFAULT,%eax EXIT END(bad_put_user) diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 2322abe4da3b..40027db99140 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -15,7 +15,6 @@ #include #include -#include #define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg) #define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l) @@ -34,10 +33,10 @@ */ #define save_common_regs \ - pushl_cfi_reg ecx + pushl %ecx #define restore_common_regs \ - popl_cfi_reg ecx + popl %ecx /* Avoid uglifying the argument copying x86-64 needs to do. */ .macro movq src, dst @@ -64,50 +63,45 @@ */ #define save_common_regs \ - pushq_cfi_reg rdi; \ - pushq_cfi_reg rsi; \ - pushq_cfi_reg rcx; \ - pushq_cfi_reg r8; \ - pushq_cfi_reg r9; \ - pushq_cfi_reg r10; \ - pushq_cfi_reg r11 + pushq %rdi; \ + pushq %rsi; \ + pushq %rcx; \ + pushq %r8; \ + pushq %r9; \ + pushq %r10; \ + pushq %r11 #define restore_common_regs \ - popq_cfi_reg r11; \ - popq_cfi_reg r10; \ - popq_cfi_reg r9; \ - popq_cfi_reg r8; \ - popq_cfi_reg rcx; \ - popq_cfi_reg rsi; \ - popq_cfi_reg rdi + popq %r11; \ + popq %r10; \ + popq %r9; \ + popq %r8; \ + popq %rcx; \ + popq %rsi; \ + popq %rdi #endif /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) - CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_down_read_failed - __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) - CFI_STARTPROC save_common_regs movq %rax,%rdi call rwsem_down_write_failed restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) - CFI_STARTPROC /* do nothing if still outstanding active readers */ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx) jnz 1f @@ -116,17 +110,14 @@ ENTRY(call_rwsem_wake) call rwsem_wake restore_common_regs 1: ret - CFI_ENDPROC ENDPROC(call_rwsem_wake) ENTRY(call_rwsem_downgrade_wake) - CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_downgrade_wake - __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index 5eb715087b80..e9acf5f4fc92 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S @@ -6,16 +6,14 @@ */ #include #include - #include /* put return address in eax (arg1) */ .macro THUNK name, func, put_ret_addr_in_eax=0 .globl \name \name: - CFI_STARTPROC - pushl_cfi_reg eax - pushl_cfi_reg ecx - pushl_cfi_reg edx + pushl %eax + pushl %ecx + pushl %edx .if \put_ret_addr_in_eax /* Place EIP in the arg1 */ @@ -23,11 +21,10 @@ .endif call \func - popl_cfi_reg edx - popl_cfi_reg ecx - popl_cfi_reg eax + popl %edx + popl %ecx + popl %eax ret - CFI_ENDPROC _ASM_NOKPROBE(\name) .endm diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index f89ba4e93025..10f555e435e1 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -6,7 +6,6 @@ * Subject to the GNU public license, v.2. No warranty of any kind. */ #include -#include #include #include @@ -14,27 +13,25 @@ .macro THUNK name, func, put_ret_addr_in_rdi=0 .globl \name \name: - CFI_STARTPROC /* this one pushes 9 elems, the next one would be %rIP */ - pushq_cfi_reg rdi - pushq_cfi_reg rsi - pushq_cfi_reg rdx - pushq_cfi_reg rcx - pushq_cfi_reg rax - pushq_cfi_reg r8 - pushq_cfi_reg r9 - pushq_cfi_reg r10 - pushq_cfi_reg r11 + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 .if \put_ret_addr_in_rdi /* 9*8(%rsp) is return addr on stack */ - movq_cfi_restore 9*8, rdi + movq 9*8(%rsp), %rdi .endif call \func jmp restore - CFI_ENDPROC _ASM_NOKPROBE(\name) .endm @@ -57,19 +54,16 @@ #if defined(CONFIG_TRACE_IRQFLAGS) \ || defined(CONFIG_DEBUG_LOCK_ALLOC) \ || defined(CONFIG_PREEMPT) - CFI_STARTPROC - CFI_ADJUST_CFA_OFFSET 9*8 restore: - popq_cfi_reg r11 - popq_cfi_reg r10 - popq_cfi_reg r9 - popq_cfi_reg r8 - popq_cfi_reg rax - popq_cfi_reg rcx - popq_cfi_reg rdx - popq_cfi_reg rsi - popq_cfi_reg rdi + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi ret - CFI_ENDPROC _ASM_NOKPROBE(restore) #endif diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index 6440221ced0d..4093216b3791 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -8,7 +8,6 @@ * of the License. */ #include -#include /* * Calling convention : -- cgit v1.2.1 From 425be5679fd292a3c36cb1fe423086708a99f11a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 22 May 2015 16:15:47 -0700 Subject: x86/asm/irq: Stop relying on magic JMP behavior for early_idt_handlers The early_idt_handlers asm code generates an array of entry points spaced nine bytes apart. It's not really clear from that code or from the places that reference it what's going on, and the code only works in the first place because GAS never generates two-byte JMP instructions when jumping to global labels. Clean up the code to generate the correct array stride (member size) explicitly. This should be considerably more robust against screw-ups, as GAS will warn if a .fill directive has a negative count. Using '. =' to advance would have been even more robust (it would generate an actual error if it tried to move backwards), but it would pad with nulls, confusing anyone who tries to disassemble the code. The new scheme should be much clearer to future readers. While we're at it, improve the comments and rename the array and common code. Binutils may start relaxing jumps to non-weak labels. If so, this change will fix our build, and we may need to backport this change. Before, on x86_64: 0000000000000000 : 0: 6a 00 pushq $0x0 2: 6a 00 pushq $0x0 4: e9 00 00 00 00 jmpq 9 5: R_X86_64_PC32 early_idt_handler-0x4 ... 48: 66 90 xchg %ax,%ax 4a: 6a 08 pushq $0x8 4c: e9 00 00 00 00 jmpq 51 4d: R_X86_64_PC32 early_idt_handler-0x4 ... 117: 6a 00 pushq $0x0 119: 6a 1f pushq $0x1f 11b: e9 00 00 00 00 jmpq 120 11c: R_X86_64_PC32 early_idt_handler-0x4 After: 0000000000000000 : 0: 6a 00 pushq $0x0 2: 6a 00 pushq $0x0 4: e9 14 01 00 00 jmpq 11d ... 48: 6a 08 pushq $0x8 4a: e9 d1 00 00 00 jmpq 120 4f: cc int3 50: cc int3 ... 117: 6a 00 pushq $0x0 119: 6a 1f pushq $0x1f 11b: eb 03 jmp 120 11d: cc int3 11e: cc int3 11f: cc int3 Signed-off-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: Binutils Cc: Borislav Petkov Cc: H.J. Lu Cc: Jan Beulich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/ac027962af343b0c599cbfcf50b945ad2ef3d7a8.1432336324.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/segment.h | 14 ++++++++++++-- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/head_32.S | 33 ++++++++++++++++++--------------- arch/x86/kernel/head_64.S | 20 +++++++++++--------- 4 files changed, 42 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5a9856eb12ba..7d5a1929d76b 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -231,11 +231,21 @@ #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) #ifdef __KERNEL__ + +/* + * early_idt_handler_array is an array of entry points referenced in the + * early IDT. For simplicity, it's a real array with one entry point + * every nine bytes. That leaves room for an optional 'push $0' if the + * vector has no error code (two bytes), a 'push $vector_number' (two + * bytes), and a jump to the common entry code (up to five bytes). + */ +#define EARLY_IDT_HANDLER_SIZE 9 + #ifndef __ASSEMBLY__ -extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; +extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; #ifdef CONFIG_TRACING -# define trace_early_idt_handlers early_idt_handlers +# define trace_early_idt_handler_array early_idt_handler_array #endif /* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2b55ee6db053..5a4668136e98 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handlers[i]); + set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d031bad9e07e..53eeb226657c 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -478,21 +478,22 @@ is486: __INIT setup_once: /* - * Set up a idt with 256 entries pointing to ignore_int, - * interrupt gates. It doesn't actually load idt - that needs - * to be done on each CPU. Interrupts are enabled elsewhere, - * when we can be relatively sure everything is ok. + * Set up a idt with 256 interrupt gates that push zero if there + * is no error code and then jump to early_idt_handler_common. + * It doesn't actually load the idt - that needs to be done on + * each CPU. Interrupts are enabled elsewhere, when we can be + * relatively sure everything is ok. */ movl $idt_table,%edi - movl $early_idt_handlers,%eax + movl $early_idt_handler_array,%eax movl $NUM_EXCEPTION_VECTORS,%ecx 1: movl %eax,(%edi) movl %eax,4(%edi) /* interrupt gate, dpl=0, present */ movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $9,%eax + addl $EARLY_IDT_HANDLER_SIZE,%eax addl $8,%edi loop 1b @@ -524,26 +525,28 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handlers) +ENTRY(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handlers) +ENDPROC(early_idt_handler_array) - /* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%esp) # X86_TRAP_NMI @@ -603,7 +606,7 @@ ex_entry: is_nmi: addl $8,%esp /* drop vector number and error code */ iret -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ ALIGN diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ae6588b301c2..df7e78057ae0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -321,26 +321,28 @@ bad_address: jmp bad_address __INIT - .globl early_idt_handlers -early_idt_handlers: +ENTRY(early_idt_handler_array) # 104(%rsp) %rflags # 96(%rsp) %cs # 88(%rsp) %rip # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushq $0 # Dummy error code, to make stack frame uniform .endif pushq $i # 72(%rsp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr +ENDPROC(early_idt_handler_array) -/* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%rsp) # X86_TRAP_NMI @@ -412,7 +414,7 @@ ENTRY(early_idt_handler) is_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) __INITDATA -- cgit v1.2.1 From 2bf557ea3f49576fabe24cd5daf1a34e9ee22c3c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 1 Jun 2015 13:02:55 +0100 Subject: x86/asm/entry/64: Use negative immediates for stack adjustments Doing so allows adjustments by 128 bytes (occurring for REMOVE_PT_GPREGS_FROM_STACK 8 uses) to be expressed with a single byte immediate. Signed-off-by: Jan Beulich Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/556C660F020000780007FB60@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/calling.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 0d76accde45b..f4e6308c4200 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -89,7 +89,7 @@ For 32-bit we have the following conventions - kernel is built with #define SIZEOF_PTREGS 21*8 .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 - subq $15*8+\addskip, %rsp + addq $-(15*8+\addskip), %rsp .endm .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 @@ -201,7 +201,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 - addq $15*8+\addskip, %rsp + subq $-(15*8+\addskip), %rsp .endm .macro icebp -- cgit v1.2.1 From 2f63b9db7260beba3c19d66d6c11b0b78ea84a8c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 1 Jun 2015 13:03:59 +0100 Subject: x86/asm/entry/64: Fold identical code paths retint_kernel doesn't require %rcx to be pointing to thread info (anymore?), and the code on the two alternative paths is - not really surprisingly - identical. Signed-off-by: Jan Beulich Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/556C664F020000780007FB64@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b84cec50c8cf..4ad79e946f5a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -615,7 +615,7 @@ ret_from_intr: testb $3, CS(%rsp) jz retint_kernel /* Interrupt came from user space */ - +retint_user: GET_THREAD_INFO(%rcx) /* * %rcx: thread info. Interrupts off. @@ -1194,15 +1194,9 @@ ENTRY(error_exit) RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) testl %eax,%eax jnz retint_kernel - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - jmp retint_swapgs + jmp retint_user END(error_exit) /* Runs on exception stack */ -- cgit v1.2.1 From ee098e1aed67715f0ce4651813d0c33ab3a56e0b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 1 Jun 2015 12:06:57 +0200 Subject: x86/cpu: Trim model ID whitespace We did try trimming whitespace surrounding the 'model name' field in /proc/cpuinfo since reportedly some userspace uses it in string comparisons and there were discrepancies: [thetango@prarit ~]# grep "^model name" /proc/cpuinfo | uniq -c | sed 's/\ /_/g' ______1_model_name :_AMD_Opteron(TM)_Processor_6272 _____63_model_name :_AMD_Opteron(TM)_Processor_6272_________________ However, there were issues with overlapping buffers, string sizes and non-byte-sized copies in the previous proposed solutions; see Link tags below for the whole farce. So, instead of diddling with this more, let's simply extend what was there originally with trimming any present trailing whitespace. Final result is really simple and obvious. Testing with the most insane model IDs qemu can generate, looks good: .model_id = " My funny model ID CPU ", ______4_model_name :_My_funny_model_ID_CPU .model_id = "My funny model ID CPU ", ______4_model_name :_My_funny_model_ID_CPU .model_id = " My funny model ID CPU", ______4_model_name :_My_funny_model_ID_CPU .model_id = " ", ______4_model_name :__ .model_id = "", ______4_model_name :_15/02 Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Igor Mammedov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1432050210-32036-1-git-send-email-prarit@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 41a8e9cb30bc..351197cbbc8e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -419,6 +420,7 @@ static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; static void get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; + char *p, *q, *s; if (c->extended_cpuid_level < 0x80000004) return; @@ -429,11 +431,21 @@ static void get_model_name(struct cpuinfo_x86 *c) cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); c->x86_model_id[48] = 0; - /* - * Remove leading whitespace on Intel processors and trailing - * whitespace on AMD processors. - */ - memmove(c->x86_model_id, strim(c->x86_model_id), 48); + /* Trim whitespace */ + p = q = s = &c->x86_model_id[0]; + + while (*p == ' ') + p++; + + while (*p) { + /* Note the last non-whitespace index */ + if (!isspace(*p)) + s = q; + + *q++ = *p++; + } + + *(s + 1) = '\0'; } void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) -- cgit v1.2.1 From af05b3009b6b106db60077db3aeef16ccd214000 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 28 May 2015 22:08:03 +0800 Subject: crypto: aesni - Convert top-level rfc4106 algorithm to new interface This patch converts rfc4106-gcm-aesni to the new AEAD interface. The low-level interface remains as is for now because we can't touch it until cryptd itself is upgraded. In the conversion I've also removed the duplicate copy of the context in the top-level algorithm. Now all processing is carried out in the low-level __driver-gcm-aes-aesni algorithm. Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 172 ++++++++++++++++++------------------- 1 file changed, 83 insertions(+), 89 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 2ade24000246..5660a183420c 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -52,7 +52,6 @@ struct aesni_rfc4106_gcm_ctx { u8 hash_subkey[16]; struct crypto_aes_ctx aes_key_expanded; u8 nonce[4]; - struct cryptd_aead *cryptd_tfm; }; struct aesni_gcm_set_hash_subkey_result { @@ -790,37 +789,30 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, #endif #ifdef CONFIG_X86_64 -static int rfc4106_init(struct crypto_tfm *tfm) +static int rfc4106_init(struct crypto_aead *aead) { struct cryptd_aead *cryptd_tfm; - struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *) - PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); - struct crypto_aead *cryptd_child; - struct aesni_rfc4106_gcm_ctx *child_ctx; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", CRYPTO_ALG_INTERNAL, CRYPTO_ALG_INTERNAL); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); - cryptd_child = cryptd_aead_child(cryptd_tfm); - child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child); - memcpy(child_ctx, ctx, sizeof(*ctx)); - ctx->cryptd_tfm = cryptd_tfm; - crypto_aead_set_reqsize(__crypto_aead_cast(tfm), + *ctx = cryptd_tfm; + crypto_aead_set_reqsize( + aead, sizeof(struct aead_request) + crypto_aead_reqsize(&cryptd_tfm->base)); return 0; } -static void rfc4106_exit(struct crypto_tfm *tfm) +static void rfc4106_exit(struct crypto_aead *aead) { - struct aesni_rfc4106_gcm_ctx *ctx = - (struct aesni_rfc4106_gcm_ctx *) - PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); - if (!IS_ERR(ctx->cryptd_tfm)) - cryptd_free_aead(ctx->cryptd_tfm); - return; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); } static void @@ -951,18 +943,10 @@ exit: static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, unsigned int key_len) { - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); - struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); - struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child); - struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm; - int ret; + struct cryptd_aead **ctx = crypto_aead_ctx(parent); + struct cryptd_aead *cryptd_tfm = *ctx; - ret = crypto_aead_setkey(child, key, key_len); - if (!ret) { - memcpy(ctx, c_ctx, sizeof(*ctx)); - ctx->cryptd_tfm = cryptd_tfm; - } - return ret; + return crypto_aead_setkey(&cryptd_tfm->base, key, key_len); } static int common_rfc4106_set_authsize(struct crypto_aead *aead, @@ -985,14 +969,10 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead, static int rfc4106_set_authsize(struct crypto_aead *parent, unsigned int authsize) { - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); - struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); - int ret; + struct cryptd_aead **ctx = crypto_aead_ctx(parent); + struct cryptd_aead *cryptd_tfm = *ctx; - ret = crypto_aead_setauthsize(child, authsize); - if (!ret) - crypto_aead_crt(parent)->authsize = authsize; - return ret; + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); } static int __driver_rfc4106_encrypt(struct aead_request *req) @@ -1171,44 +1151,42 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) static int rfc4106_encrypt(struct aead_request *req) { - int ret; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + struct cryptd_aead **ctx = crypto_aead_ctx(tfm); + struct cryptd_aead *cryptd_tfm = *ctx; + struct aead_request *subreq = aead_request_ctx(req); - if (!irq_fpu_usable()) { - struct aead_request *cryptd_req = - (struct aead_request *) aead_request_ctx(req); + aead_request_set_tfm(subreq, irq_fpu_usable() ? + cryptd_aead_child(cryptd_tfm) : + &cryptd_tfm->base); - memcpy(cryptd_req, req, sizeof(*req)); - aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - ret = crypto_aead_encrypt(cryptd_req); - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_encrypt(req); - kernel_fpu_end(); - } - return ret; + aead_request_set_callback(subreq, req->base.flags, + req->base.complete, req->base.data); + aead_request_set_crypt(subreq, req->src, req->dst, + req->cryptlen, req->iv); + aead_request_set_ad(subreq, req->assoclen); + + return crypto_aead_encrypt(subreq); } static int rfc4106_decrypt(struct aead_request *req) { - int ret; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + struct cryptd_aead **ctx = crypto_aead_ctx(tfm); + struct cryptd_aead *cryptd_tfm = *ctx; + struct aead_request *subreq = aead_request_ctx(req); - if (!irq_fpu_usable()) { - struct aead_request *cryptd_req = - (struct aead_request *) aead_request_ctx(req); + aead_request_set_tfm(subreq, irq_fpu_usable() ? + cryptd_aead_child(cryptd_tfm) : + &cryptd_tfm->base); - memcpy(cryptd_req, req, sizeof(*req)); - aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - ret = crypto_aead_decrypt(cryptd_req); - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_decrypt(req); - kernel_fpu_end(); - } - return ret; + aead_request_set_callback(subreq, req->base.flags, + req->base.complete, req->base.data); + aead_request_set_crypt(subreq, req->src, req->dst, + req->cryptlen, req->iv); + aead_request_set_ad(subreq, req->assoclen); + + return crypto_aead_decrypt(subreq); } static int helper_rfc4106_encrypt(struct aead_request *req) @@ -1432,30 +1410,6 @@ static struct crypto_alg aesni_algs[] = { { .maxauthsize = 16, }, }, -}, { - .cra_name = "rfc4106(gcm(aes))", - .cra_driver_name = "rfc4106-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + - AESNI_ALIGN, - .cra_alignmask = 0, - .cra_type = &crypto_nivaead_type, - .cra_module = THIS_MODULE, - .cra_init = rfc4106_init, - .cra_exit = rfc4106_exit, - .cra_u = { - .aead = { - .setkey = rfc4106_set_key, - .setauthsize = rfc4106_set_authsize, - .encrypt = rfc4106_encrypt, - .decrypt = rfc4106_decrypt, - .geniv = "seqiv", - .ivsize = 8, - .maxauthsize = 16, - }, - }, #endif #if IS_ENABLED(CONFIG_CRYPTO_PCBC) }, { @@ -1570,6 +1524,30 @@ static struct crypto_alg aesni_algs[] = { { }, } }; +#ifdef CONFIG_X86_64 +static struct aead_alg aesni_aead_algs[] = { { + .init = rfc4106_init, + .exit = rfc4106_exit, + .setkey = rfc4106_set_key, + .setauthsize = rfc4106_set_authsize, + .encrypt = rfc4106_encrypt, + .decrypt = rfc4106_decrypt, + .ivsize = 8, + .maxauthsize = 16, + .base = { + .cra_name = "rfc4106(gcm(aes))", + .cra_driver_name = "rfc4106-gcm-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cryptd_aead *), + .cra_module = THIS_MODULE, + }, +} }; +#else +static struct aead_alg aesni_aead_algs[0]; +#endif + static const struct x86_cpu_id aesni_cpu_id[] = { X86_FEATURE_MATCH(X86_FEATURE_AES), @@ -1617,11 +1595,27 @@ static int __init aesni_init(void) if (err) return err; - return crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); + err = crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); + if (err) + goto fpu_exit; + + err = crypto_register_aeads(aesni_aead_algs, + ARRAY_SIZE(aesni_aead_algs)); + if (err) + goto unregister_algs; + + return err; + +unregister_algs: + crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); +fpu_exit: + crypto_fpu_exit(); + return err; } static void __exit aesni_exit(void) { + crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); crypto_fpu_exit(); -- cgit v1.2.1 From b7c89d9e2fef1c9f4c9d8bacf0c1459e30561289 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 1 Jun 2015 15:53:06 +0800 Subject: crypto: aesni - Convert rfc4106 to new AEAD interface This patch converts the low-level __gcm-aes-aesni algorithm to the new AEAD interface. Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 250 ++++++++++++------------------------- 1 file changed, 83 insertions(+), 167 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 5660a183420c..ebcb981d241c 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -44,13 +44,18 @@ #endif +#define AESNI_ALIGN 16 +#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1)) +#define RFC4106_HASH_SUBKEY_SIZE 16 + /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. */ struct aesni_rfc4106_gcm_ctx { - u8 hash_subkey[16]; - struct crypto_aes_ctx aes_key_expanded; + u8 hash_subkey[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + struct crypto_aes_ctx aes_key_expanded + __attribute__ ((__aligned__(AESNI_ALIGN))); u8 nonce[4]; }; @@ -65,10 +70,6 @@ struct aesni_hash_subkey_req_data { struct scatterlist sg; }; -#define AESNI_ALIGN (16) -#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) -#define RFC4106_HASH_SUBKEY_SIZE 16 - struct aesni_lrw_ctx { struct lrw_table_ctx lrw_table; u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; @@ -282,10 +283,11 @@ static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out, static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) { - return - (struct aesni_rfc4106_gcm_ctx *) - PTR_ALIGN((u8 *) - crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN); + unsigned long align = AESNI_ALIGN; + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + return PTR_ALIGN(crypto_aead_ctx(tfm), align); } #endif @@ -838,8 +840,6 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) if (IS_ERR(ctr_tfm)) return PTR_ERR(ctr_tfm); - crypto_ablkcipher_clear_flags(ctr_tfm, ~0); - ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); if (ret) goto out_free_ablkcipher; @@ -888,56 +888,20 @@ out_free_ablkcipher: static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, unsigned int key_len) { - int ret = 0; - struct crypto_tfm *tfm = crypto_aead_tfm(aead); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); - u8 *new_key_align, *new_key_mem = NULL; if (key_len < 4) { - crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } /*Account for 4 byte nonce at the end.*/ key_len -= 4; - if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) { - crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); - /*This must be on a 16 byte boundary!*/ - if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN) - return -EINVAL; - - if ((unsigned long)key % AESNI_ALIGN) { - /*key is not aligned: use an auxuliar aligned pointer*/ - new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL); - if (!new_key_mem) - return -ENOMEM; - - new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN); - memcpy(new_key_align, key, key_len); - key = new_key_align; - } - if (!irq_fpu_usable()) - ret = crypto_aes_expand_key(&(ctx->aes_key_expanded), - key, key_len); - else { - kernel_fpu_begin(); - ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len); - kernel_fpu_end(); - } - /*This must be on a 16 byte boundary!*/ - if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) { - ret = -EINVAL; - goto exit; - } - ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); -exit: - kfree(new_key_mem); - return ret; + return aes_set_key_common(crypto_aead_tfm(aead), + &ctx->aes_key_expanded, key, key_len) ?: + rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); } static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, @@ -960,7 +924,7 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead, default: return -EINVAL; } - crypto_aead_crt(aead)->authsize = authsize; + return 0; } @@ -975,20 +939,17 @@ static int rfc4106_set_authsize(struct crypto_aead *parent, return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); } -static int __driver_rfc4106_encrypt(struct aead_request *req) +static int helper_rfc4106_encrypt(struct aead_request *req) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - u32 key_len = ctx->aes_key_expanded.key_length; void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv_tab[16+AESNI_ALIGN]; - u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); struct scatter_walk src_sg_walk; - struct scatter_walk assoc_sg_walk; struct scatter_walk dst_sg_walk; unsigned int i; @@ -997,12 +958,6 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) /* to 8 or 12 bytes */ if (unlikely(req->assoclen != 8 && req->assoclen != 12)) return -EINVAL; - if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) - return -EINVAL; - if (unlikely(key_len != AES_KEYSIZE_128 && - key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256)) - return -EINVAL; /* IV below built */ for (i = 0; i < 4; i++) @@ -1011,55 +966,57 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) *(iv+4+i) = req->iv[i]; *((__be32 *)(iv+12)) = counter; - if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { + if (sg_is_last(req->src) && + req->src->offset + req->src->length <= PAGE_SIZE && + sg_is_last(req->dst) && + req->dst->offset + req->dst->length <= PAGE_SIZE) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); - scatterwalk_start(&assoc_sg_walk, req->assoc); - src = scatterwalk_map(&src_sg_walk); - assoc = scatterwalk_map(&assoc_sg_walk); + assoc = scatterwalk_map(&src_sg_walk); + src = assoc + req->assoclen; dst = src; if (unlikely(req->src != req->dst)) { scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk); + dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; } - } else { /* Allocate memory for src, dst, assoc */ - src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, + assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, GFP_ATOMIC); - if (unlikely(!src)) + if (unlikely(!assoc)) return -ENOMEM; - assoc = (src + req->cryptlen + auth_tag_len); - scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); - scatterwalk_map_and_copy(assoc, req->assoc, 0, - req->assoclen, 0); + scatterwalk_map_and_copy(assoc, req->src, 0, + req->assoclen + req->cryptlen, 0); + src = assoc + req->assoclen; dst = src; } + kernel_fpu_begin(); aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst + ((unsigned long)req->cryptlen), auth_tag_len); + kernel_fpu_end(); /* The authTag (aka the Integrity Check Value) needs to be written * back to the packet. */ if (one_entry_in_sg) { if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst); - scatterwalk_done(&dst_sg_walk, 0, 0); + scatterwalk_unmap(dst - req->assoclen); + scatterwalk_advance(&dst_sg_walk, req->dst->length); + scatterwalk_done(&dst_sg_walk, 1, 0); } - scatterwalk_unmap(src); scatterwalk_unmap(assoc); - scatterwalk_done(&src_sg_walk, 0, 0); - scatterwalk_done(&assoc_sg_walk, 0, 0); + scatterwalk_advance(&src_sg_walk, req->src->length); + scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); } else { - scatterwalk_map_and_copy(dst, req->dst, 0, - req->cryptlen + auth_tag_len, 1); - kfree(src); + scatterwalk_map_and_copy(dst, req->dst, req->assoclen, + req->cryptlen + auth_tag_len, 1); + kfree(assoc); } return 0; } -static int __driver_rfc4106_decrypt(struct aead_request *req) +static int helper_rfc4106_decrypt(struct aead_request *req) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; @@ -1068,26 +1025,16 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) int retval = 0; struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - u32 key_len = ctx->aes_key_expanded.key_length; void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv_and_authTag[32+AESNI_ALIGN]; - u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN); - u8 *authTag = iv + 16; + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 authTag[16]; struct scatter_walk src_sg_walk; - struct scatter_walk assoc_sg_walk; struct scatter_walk dst_sg_walk; unsigned int i; - if (unlikely((req->cryptlen < auth_tag_len) || - (req->assoclen != 8 && req->assoclen != 12))) + if (unlikely(req->assoclen != 8 && req->assoclen != 12)) return -EINVAL; - if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) - return -EINVAL; - if (unlikely(key_len != AES_KEYSIZE_128 && - key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256)) - return -EINVAL; /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length */ @@ -1101,33 +1048,36 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) *(iv+4+i) = req->iv[i]; *((__be32 *)(iv+12)) = counter; - if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { + if (sg_is_last(req->src) && + req->src->offset + req->src->length <= PAGE_SIZE && + sg_is_last(req->dst) && + req->dst->offset + req->dst->length <= PAGE_SIZE) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); - scatterwalk_start(&assoc_sg_walk, req->assoc); - src = scatterwalk_map(&src_sg_walk); - assoc = scatterwalk_map(&assoc_sg_walk); + assoc = scatterwalk_map(&src_sg_walk); + src = assoc + req->assoclen; dst = src; if (unlikely(req->src != req->dst)) { scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk); + dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; } } else { /* Allocate memory for src, dst, assoc */ - src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); - if (!src) + assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); + if (!assoc) return -ENOMEM; - assoc = (src + req->cryptlen); - scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); - scatterwalk_map_and_copy(assoc, req->assoc, 0, - req->assoclen, 0); + scatterwalk_map_and_copy(assoc, req->src, 0, + req->assoclen + req->cryptlen, 0); + src = assoc + req->assoclen; dst = src; } + kernel_fpu_begin(); aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, ctx->hash_subkey, assoc, (unsigned long)req->assoclen, authTag, auth_tag_len); + kernel_fpu_end(); /* Compare generated tag with passed in tag. */ retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ? @@ -1135,16 +1085,17 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) if (one_entry_in_sg) { if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst); - scatterwalk_done(&dst_sg_walk, 0, 0); + scatterwalk_unmap(dst - req->assoclen); + scatterwalk_advance(&dst_sg_walk, req->dst->length); + scatterwalk_done(&dst_sg_walk, 1, 0); } - scatterwalk_unmap(src); scatterwalk_unmap(assoc); - scatterwalk_done(&src_sg_walk, 0, 0); - scatterwalk_done(&assoc_sg_walk, 0, 0); + scatterwalk_advance(&src_sg_walk, req->src->length); + scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); } else { - scatterwalk_map_and_copy(dst, req->dst, 0, tempCipherLen, 1); - kfree(src); + scatterwalk_map_and_copy(dst, req->dst, req->assoclen, + tempCipherLen, 1); + kfree(assoc); } return retval; } @@ -1188,36 +1139,6 @@ static int rfc4106_decrypt(struct aead_request *req) return crypto_aead_decrypt(subreq); } - -static int helper_rfc4106_encrypt(struct aead_request *req) -{ - int ret; - - if (unlikely(!irq_fpu_usable())) { - WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); - ret = -EINVAL; - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_encrypt(req); - kernel_fpu_end(); - } - return ret; -} - -static int helper_rfc4106_decrypt(struct aead_request *req) -{ - int ret; - - if (unlikely(!irq_fpu_usable())) { - WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); - ret = -EINVAL; - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_decrypt(req); - kernel_fpu_end(); - } - return ret; -} #endif static struct crypto_alg aesni_algs[] = { { @@ -1389,27 +1310,6 @@ static struct crypto_alg aesni_algs[] = { { .geniv = "chainiv", }, }, -}, { - .cra_name = "__gcm-aes-aesni", - .cra_driver_name = "__driver-gcm-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + - AESNI_ALIGN, - .cra_alignmask = 0, - .cra_type = &crypto_aead_type, - .cra_module = THIS_MODULE, - .cra_u = { - .aead = { - .setkey = common_rfc4106_set_key, - .setauthsize = common_rfc4106_set_authsize, - .encrypt = helper_rfc4106_encrypt, - .decrypt = helper_rfc4106_decrypt, - .ivsize = 8, - .maxauthsize = 16, - }, - }, #endif #if IS_ENABLED(CONFIG_CRYPTO_PCBC) }, { @@ -1526,6 +1426,22 @@ static struct crypto_alg aesni_algs[] = { { #ifdef CONFIG_X86_64 static struct aead_alg aesni_aead_algs[] = { { + .setkey = common_rfc4106_set_key, + .setauthsize = common_rfc4106_set_authsize, + .encrypt = helper_rfc4106_encrypt, + .decrypt = helper_rfc4106_decrypt, + .ivsize = 8, + .maxauthsize = 16, + .base = { + .cra_name = "__gcm-aes-aesni", + .cra_driver_name = "__driver-gcm-aes-aesni", + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), + .cra_alignmask = AESNI_ALIGN - 1, + .cra_module = THIS_MODULE, + }, +}, { .init = rfc4106_init, .exit = rfc4106_exit, .setkey = rfc4106_set_key, -- cgit v1.2.1 From 6471b825c41e6fc3cd41caa18d15142d0e121e76 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 10:00:13 +0200 Subject: x86/kconfig: Reorganize arch feature Kconfig select's Peter Zijstra noticed that in arch/x86/Kconfig there are a lot of X86_{32,64} clauses in the X86 symbol, plus there are a number of similar selects in the X86_32 and X86_64 config definitions as well - which all overlap in an inconsistent mess. So: - move all select's from X86_32 and X86_64 to the X64 config option - sort their names, so that duplications are easier to spot - align their if clauses, so that they are easier to identify at a glance - and so that weirdnesses stand out more No change in functionality: 105 insertions(+) 105 deletions(-) Originally-from: Peter Zijlstra Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150602153027.GU3644@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 210 +++++++++++++++++++++++++++---------------------------- 1 file changed, 105 insertions(+), 105 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a708bcc1615c..8a5cca39e74f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -9,141 +9,141 @@ config 64BIT config X86_32 def_bool y depends on !64BIT - select CLKSRC_I8253 - select HAVE_UID16 config X86_64 def_bool y depends on 64BIT - select X86_DEV_DMA_OPS - select ARCH_USE_CMPXCHG_LOCKREF - select HAVE_LIVEPATCH ### Arch settings config X86 def_bool y - select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI + select ACPI_LEGACY_TABLES_LOOKUP if ACPI + select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI + select ANON_INODES + select ARCH_CLOCKSOURCE_DATA + select ARCH_DISCARD_MEMBLOCK + select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_SG_CHAIN + select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO - select HAVE_AOUT if X86_32 - select HAVE_UNSTABLE_SCHED_CLOCK - select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 - select ARCH_SUPPORTS_INT128 if X86_64 - select HAVE_IDE - select HAVE_OPROFILE - select HAVE_PCSPKR_PLATFORM - select HAVE_PERF_EVENTS - select HAVE_IOREMAP_PROT - select HAVE_KPROBES - select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK - select ARCH_WANT_OPTIONAL_GPIOLIB + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_INT128 if X86_64 + select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if X86_64 + select ARCH_USE_QUEUED_RWLOCKS + select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_FRAME_POINTERS - select HAVE_DMA_ATTRS - select HAVE_DMA_CONTIGUOUS - select HAVE_KRETPROBES + select ARCH_WANT_IPC_PARSE_VERSION if X86_32 + select ARCH_WANT_OPTIONAL_GPIOLIB + select BUILDTIME_EXTABLE_SORT + select CLKEVT_I8253 + select CLKSRC_I8253 if X86_32 + select CLOCKSOURCE_VALIDATE_LAST_CYCLE + select CLOCKSOURCE_WATCHDOG + select CLONE_BACKWARDS if X86_32 + select COMPAT_OLD_SIGACTION if IA32_EMULATION + select DCACHE_WORD_ACCESS + select GENERIC_CLOCKEVENTS + select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) + select GENERIC_CLOCKEVENTS_MIN_ADJUST + select GENERIC_CMOS_UPDATE + select GENERIC_CPU_AUTOPROBE select GENERIC_EARLY_IOREMAP - select HAVE_OPTPROBES - select HAVE_KPROBES_ON_FTRACE - select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FENTRY if X86_64 + select GENERIC_FIND_FIRST_BIT + select GENERIC_IOMAP + select GENERIC_IRQ_PROBE + select GENERIC_IRQ_SHOW + select GENERIC_PENDING_IRQ if SMP + select GENERIC_SMP_IDLE_THREAD + select GENERIC_STRNCPY_FROM_USER + select GENERIC_STRNLEN_USER + select GENERIC_TIME_VSYSCALL + select HAVE_ACPI_APEI if ACPI + select HAVE_ACPI_APEI_NMI if ACPI + select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_AOUT if X86_32 + select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE + select HAVE_ARCH_JUMP_LABEL + select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_ARCH_KGDB + select HAVE_ARCH_KMEMCHECK + select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_SOFT_DIRTY if X86_64 + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_BPF_JIT if X86_64 + select HAVE_CC_STACKPROTECTOR + select HAVE_CMPXCHG_DOUBLE + select HAVE_CMPXCHG_LOCAL + select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_C_RECORDMCOUNT + select HAVE_DEBUG_KMEMLEAK + select HAVE_DEBUG_STACKOVERFLOW + select HAVE_DMA_API_DEBUG + select HAVE_DMA_ATTRS + select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS - select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_GRAPH_FP_TEST - select HAVE_SYSCALL_TRACEPOINTS - select SYSCTL_EXCEPTION_TRACE - select HAVE_KVM - select HAVE_ARCH_KGDB - select HAVE_ARCH_TRACEHOOK - select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS - select USER_STACKTRACE_SUPPORT - select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_DMA_API_DEBUG - select HAVE_KERNEL_GZIP + select HAVE_FENTRY if X86_64 + select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_GRAPH_FP_TEST + select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_TRACER + select HAVE_GENERIC_DMA_COHERENT if X86_32 + select HAVE_HW_BREAKPOINT + select HAVE_IDE + select HAVE_IOREMAP_PROT + select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 + select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_BZIP2 + select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZ4 select HAVE_KERNEL_LZMA - select HAVE_KERNEL_XZ select HAVE_KERNEL_LZO - select HAVE_KERNEL_LZ4 - select HAVE_HW_BREAKPOINT + select HAVE_KERNEL_XZ + select HAVE_KPROBES + select HAVE_KPROBES_ON_FTRACE + select HAVE_KRETPROBES + select HAVE_KVM + select HAVE_LIVEPATCH if X86_64 + select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS - select PERF_EVENTS + select HAVE_OPROFILE + select HAVE_OPTPROBES + select HAVE_PCSPKR_PLATFORM + select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select HAVE_DEBUG_KMEMLEAK - select ANON_INODES - select HAVE_ALIGNED_STRUCT_PAGE if SLUB - select HAVE_CMPXCHG_LOCAL - select HAVE_CMPXCHG_DOUBLE - select HAVE_ARCH_KMEMCHECK - select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_UID16 if X86_32 + select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER - select ARCH_HAS_ELF_RANDOMIZE - select HAVE_ARCH_JUMP_LABEL - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select SPARSE_IRQ - select GENERIC_FIND_FIRST_BIT - select GENERIC_IRQ_PROBE - select GENERIC_PENDING_IRQ if SMP - select GENERIC_IRQ_SHOW - select GENERIC_CLOCKEVENTS_MIN_ADJUST select IRQ_FORCED_THREADING - select HAVE_BPF_JIT if X86_64 - select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE - select ARCH_HAS_SG_CHAIN - select CLKEVT_I8253 - select ARCH_HAVE_NMI_SAFE_CMPXCHG - select GENERIC_IOMAP - select DCACHE_WORD_ACCESS - select GENERIC_SMP_IDLE_THREAD - select ARCH_WANT_IPC_PARSE_VERSION if X86_32 - select HAVE_ARCH_SECCOMP_FILTER - select BUILDTIME_EXTABLE_SORT - select GENERIC_CMOS_UPDATE - select HAVE_ARCH_SOFT_DIRTY if X86_64 - select CLOCKSOURCE_WATCHDOG - select GENERIC_CLOCKEVENTS - select ARCH_CLOCKSOURCE_DATA - select CLOCKSOURCE_VALIDATE_LAST_CYCLE - select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) - select GENERIC_TIME_VSYSCALL - select GENERIC_STRNCPY_FROM_USER - select GENERIC_STRNLEN_USER - select HAVE_CONTEXT_TRACKING if X86_64 - select HAVE_IRQ_TIME_ACCOUNTING - select VIRT_TO_BUS - select MODULES_USE_ELF_REL if X86_32 - select MODULES_USE_ELF_RELA if X86_64 - select CLONE_BACKWARDS if X86_32 - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_QUEUED_SPINLOCKS - select ARCH_USE_QUEUED_RWLOCKS - select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION - select OLD_SIGACTION if X86_32 - select COMPAT_OLD_SIGACTION if IA32_EMULATION + select MODULES_USE_ELF_RELA if X86_64 + select MODULES_USE_ELF_REL if X86_32 + select OLD_SIGACTION if X86_32 + select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION + select PERF_EVENTS select RTC_LIB - select HAVE_DEBUG_STACKOVERFLOW - select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 - select HAVE_CC_STACKPROTECTOR - select GENERIC_CPU_AUTOPROBE - select HAVE_ARCH_AUDITSYSCALL - select ARCH_SUPPORTS_ATOMIC_RMW - select HAVE_ACPI_APEI if ACPI - select HAVE_ACPI_APEI_NMI if ACPI - select ACPI_LEGACY_TABLES_LOOKUP if ACPI - select X86_FEATURE_NAMES if PROC_FS + select SPARSE_IRQ select SRCU + select SYSCTL_EXCEPTION_TRACE + select USER_STACKTRACE_SUPPORT + select VIRT_TO_BUS + select X86_DEV_DMA_OPS if X86_64 + select X86_FEATURE_NAMES if PROC_FS config INSTRUCTION_DECODER def_bool y -- cgit v1.2.1 From d6472302f242559d45dcf4ebace62508dc4d8aeb Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 2 Jun 2015 19:01:38 +1000 Subject: x86/mm: Decouple from Nothing in uses anything from , so remove it from there and fix up the resulting build problems triggered on x86 {64|32}-bit {def|allmod|allno}configs. The breakages were triggering in places where x86 builds relied on vmalloc() facilities but did not include explicitly and relied on the implicit inclusion via . Also add: - to - to ... which were two other implicit header file dependencies. Suggested-by: David Miller Signed-off-by: Stephen Rothwell [ Tidied up the changelog. ] Acked-by: David Miller Acked-by: Takashi Iwai Acked-by: Viresh Kumar Acked-by: Vinod Koul Cc: Andrew Morton Cc: Anton Vorontsov Cc: Boris Ostrovsky Cc: Colin Cross Cc: David Vrabel Cc: H. Peter Anvin Cc: Haiyang Zhang Cc: James E.J. Bottomley Cc: Jaroslav Kysela Cc: K. Y. Srinivasan Cc: Kees Cook Cc: Konrad Rzeszutek Wilk Cc: Kristen Carlson Accardi Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Suma Ramars Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 3 +-- arch/x86/kernel/crash.c | 1 + arch/x86/kernel/machine_kexec_64.c | 1 + arch/x86/mm/pageattr-test.c | 1 + arch/x86/mm/pageattr.c | 1 + arch/x86/xen/p2m.c | 1 + 6 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index a2b97404922d..a94463063b46 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -40,6 +40,7 @@ #include #include #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -198,8 +199,6 @@ extern void set_iounmap_nonlazy(void); #include -#include - /* * Convert a virtual cached pointer to an uncached pointer */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index c76d3e37c6e1..e068d6683dba 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 415480d3ea84..11546b462fa6 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 6629f397b467..8ff686aa7e8c 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 70d221fe2eb4..fae3c5366ac0 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b47124d4cd67..8b7f18e200aa 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include -- cgit v1.2.1 From 905a36a2851838bca5a424fb758e201990234e6e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 13:37:36 +0200 Subject: x86/asm/entry: Move entry_64.S and entry_32.S to arch/x86/entry/ Create a new directory hierarchy for the low level x86 entry code: arch/x86/entry/* This will host all the low level glue that is currently scattered all across arch/x86/. Start with entry_64.S and entry_32.S. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kbuild | 3 + arch/x86/entry/Makefile | 4 + arch/x86/entry/entry_32.S | 1249 ++++++++++++++++++++++++++++++++++++++ arch/x86/entry/entry_64.S | 1442 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/entry_32.S | 1249 -------------------------------------- arch/x86/kernel/entry_64.S | 1442 -------------------------------------------- 7 files changed, 2699 insertions(+), 2692 deletions(-) create mode 100644 arch/x86/entry/Makefile create mode 100644 arch/x86/entry/entry_32.S create mode 100644 arch/x86/entry/entry_64.S delete mode 100644 arch/x86/kernel/entry_32.S delete mode 100644 arch/x86/kernel/entry_64.S (limited to 'arch/x86') diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 3942f74c92d7..b9b816277e72 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -1,3 +1,6 @@ + +obj-y += entry/ + obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile new file mode 100644 index 000000000000..fa7e0cf6d3c4 --- /dev/null +++ b/arch/x86/entry/Makefile @@ -0,0 +1,4 @@ +# +# Makefile for the x86 low level entry code +# +obj-y := entry_$(BITS).o diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S new file mode 100644 index 000000000000..0ac73de925d1 --- /dev/null +++ b/arch/x86/entry/entry_32.S @@ -0,0 +1,1249 @@ +/* + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'syscall_exit': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + + .section .entry.text, "ax" + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +#ifdef CONFIG_PREEMPT +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +#define preempt_stop(clobbers) +#define resume_kernel restore_all +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. + */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl $0 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl %gs +.endm + +.macro POP_GS pop=0 +98: popl %gs + .if \pop <> 0 + add $\pop, %esp + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b,99b) +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b,99b) +.endm + +.macro GS_TO_REG reg + movl %gs, \reg +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL + cld + PUSH_GS + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx +.endm + +.macro RESTORE_INT_REGS + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl %ds +2: popl %es +3: popl %fs + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.popsection + _ASM_EXTABLE(1b,4b) + _ASM_EXTABLE(2b,5b) + _ASM_EXTABLE(3b,6b) + POP_GS_EX +.endm + +ENTRY(ret_from_fork) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + jmp syscall_exit +END(ret_from_fork) + +ENTRY(ret_from_kernel_thread) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + movl PT_EBP(%esp),%eax + call *PT_EBX(%esp) + movl $0,PT_EAX(%esp) + jmp syscall_exit +ENDPROC(ret_from_kernel_thread) + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from child spawned by kernel_thread(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) +need_resched: + cmpl $0,PER_CPU_VAR(__preempt_count) + jnz restore_all + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(ia32_sysenter_target) + movl TSS_sysenter_sp0(%esp),%esp +sysenter_past_esp: + /* + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. + */ + pushl $__USER_DS + pushl %ebp + pushfl + orl $X86_EFLAGS_IF, (%esp) + pushl $__USER_CS + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary: TI_sysenter_return + * is relative to thread_info, which is at the bottom of the + * kernel stack page. 4*4 means the 4 words pushed above; + * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; + * and THREAD_SIZE takes us to the bottom. + */ + pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) + + pushl %eax + SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) + +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault + ASM_STAC +1: movl (%ebp),%ebp + ASM_CLAC + movl %ebp,PT_EBP(%esp) + _ASM_EXTABLE(1b,syscall_fault) + + GET_THREAD_INFO(%ebp) + + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: + cmpl $(NR_syscalls), %eax + jae sysenter_badsys + call *sys_call_table(,%eax,4) +sysenter_after_call: + movl %eax,PT_EAX(%esp) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jnz sysexit_audit +sysenter_exit: +/* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp,%ebp + TRACE_IRQS_ON +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ + movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ + /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ + pushl PT_ESI(%esp) /* a3: 5th arg */ + pushl PT_EDX+4(%esp) /* a2: 4th arg */ + call __audit_syscall_entry + popl %ecx /* get that remapped edx off the stack */ + popl %ecx /* get that remapped esi off the stack */ + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + call __audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + +.pushsection .fixup,"ax" +2: movl $0,PT_FS(%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b,2b) + PTGS_TO_GS_EX +ENDPROC(ia32_sysenter_target) + + # system call handler stub +ENTRY(system_call) + ASM_CLAC + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation / emulation + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(NR_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) +syscall_after_call: + movl %eax,PT_EAX(%esp) # store the return value +syscall_exit: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jnz syscall_exit_work + +restore_all: + TRACE_IRQS_IRET +restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + je ldt_ss # returning to user-space with LDT SS +#endif +restore_nocheck: + RESTORE_REGS 4 # skip orig_eax/error_code +irq_return: + INTERRUPT_RETURN +.section .fixup,"ax" +ENTRY(iret_exc) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous + _ASM_EXTABLE(irq_return,iret_exc) + +#ifdef CONFIG_X86_ESPFIX32 +ldt_ss: +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck +#endif + +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ + DISABLE_INTERRUPTS(CLBR_EAX) + lss (%esp), %esp /* switch to espfix segment */ + jmp restore_nocheck +#endif +ENDPROC(system_call) + + # perform work that needs to be done immediately before resumption + ALIGN +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) + movl %esp, %eax + jnz work_notifysig_v86 # returning to kernel-space or + # vm86-space +1: +#else + movl %esp, %eax +#endif + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movb PT_CS(%esp), %bl + andb $SEGMENT_RPL_MASK, %bl + cmpb $USER_RPL, %bl + jb resume_kernel + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace + +#ifdef CONFIG_VM86 + ALIGN +work_notifysig_v86: + pushl %ecx # save ti_flags for do_notify_resume + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + movl %eax, %esp + jmp 1b +#endif +END(work_pending) + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + movl %esp, %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. */ + cmpl $(NR_syscalls), %eax + jnae syscall_call + jmp syscall_exit +END(syscall_trace_entry) + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call + # schedule() instead + movl %esp, %eax + call syscall_trace_leave + jmp resume_userspace +END(syscall_exit_work) + +syscall_fault: + ASM_CLAC + GET_THREAD_INFO(%ebp) + movl $-EFAULT,PT_EAX(%esp) + jmp resume_userspace +END(syscall_fault) + +syscall_badsys: + movl $-ENOSYS,%eax + jmp syscall_after_call +END(syscall_badsys) + +sysenter_badsys: + movl $-ENOSYS,%eax + jmp sysenter_after_call +END(sysenter_badsys) + +.macro FIXUP_ESPFIX_STACK +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ +#ifdef CONFIG_X86_ESPFIX32 + /* fixup the stack */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ + pushl $__KERNEL_DS + pushl %eax + lss (%esp), %esp /* switch to the normal stack segment */ +#endif +.endm +.macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 + movl %ss, %eax + /* see if on espfix stack */ + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + /* switch to normal stack */ + FIXUP_ESPFIX_STACK +27: +#endif +.endm + +/* + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. + */ + .align 8 +ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushl $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt + .align 8 + .endr +END(irq_entries_start) + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + ASM_CLAC + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ + SAVE_ALL + TRACE_IRQS_OFF + movl %esp,%eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + ASM_CLAC; \ + pushl $~(nr); \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ + call fn; \ + jmp ret_from_intr; \ +ENDPROC(name) + + +#ifdef CONFIG_TRACING +#define TRACE_BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) +#else +#define TRACE_BUILD_INTERRUPT(name, nr) +#endif + +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ + TRACE_BUILD_INTERRUPT(name, nr) + +/* The include is where all of the SMP etc. interrupts come from */ +#include + +ENTRY(coprocessor_error) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_error + jmp error_code +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + ASM_CLAC + pushl $0 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ + ALTERNATIVE "pushl $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM +#else + pushl $do_simd_coprocessor_error +#endif + jmp error_code +END(simd_coprocessor_error) + +ENTRY(device_not_available) + ASM_CLAC + pushl $-1 # mark this as an int + pushl $do_device_not_available + jmp error_code +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret + _ASM_EXTABLE(native_iret, iret_exc) +END(native_iret) + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +ENTRY(overflow) + ASM_CLAC + pushl $0 + pushl $do_overflow + jmp error_code +END(overflow) + +ENTRY(bounds) + ASM_CLAC + pushl $0 + pushl $do_bounds + jmp error_code +END(bounds) + +ENTRY(invalid_op) + ASM_CLAC + pushl $0 + pushl $do_invalid_op + jmp error_code +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp error_code +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + ASM_CLAC + pushl $do_invalid_TSS + jmp error_code +END(invalid_TSS) + +ENTRY(segment_not_present) + ASM_CLAC + pushl $do_segment_not_present + jmp error_code +END(segment_not_present) + +ENTRY(stack_segment) + ASM_CLAC + pushl $do_stack_segment + jmp error_code +END(stack_segment) + +ENTRY(alignment_check) + ASM_CLAC + pushl $do_alignment_check + jmp error_code +END(alignment_check) + +ENTRY(divide_error) + ASM_CLAC + pushl $0 # no error code + pushl $do_divide_error + jmp error_code +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + ASM_CLAC + pushl $0 + pushl machine_check_vector + jmp error_code +END(machine_check) +#endif + +ENTRY(spurious_interrupt_bug) + ASM_CLAC + pushl $0 + pushl $do_spurious_interrupt_bug + jmp error_code +END(spurious_interrupt_bug) + +#ifdef CONFIG_XEN +/* Xen doesn't set %esp to be precisely what the normal sysenter + entrypoint expects, so fix it up before using the normal path. */ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp + +ENTRY(xen_hypervisor_callback) + pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + TRACE_IRQS_OFF + + /* Check to see if we got the event in the critical + region in xen_iret_direct, after we've reenabled + events and checked for pending events. This simulates + iret instruction's behaviour where it delivers a + pending interrupt when enabling interrupts. */ + movl PT_EIP(%esp),%eax + cmpl $xen_iret_start_crit,%eax + jb 1f + cmpl $xen_iret_end_crit,%eax + jae 1f + + jmp xen_iret_crit_fixup + +ENTRY(xen_do_upcall) +1: mov %esp, %eax + call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif + jmp ret_from_intr +ENDPROC(xen_hypervisor_callback) + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(xen_failsafe_callback) + pushl %eax + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + /* EAX == 0 => Category 1 (Bad segment) + EAX != 0 => Category 2 (Bad IRET) */ + testl %eax,%eax + popl %eax + lea 16(%esp),%esp + jz 5f + jmp iret_exc +5: pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + jmp ret_from_exception + +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous + _ASM_EXTABLE(1b,6b) + _ASM_EXTABLE(2b,7b) + _ASM_EXTABLE(3b,8b) + _ASM_EXTABLE(4b,9b) +ENDPROC(xen_failsafe_callback) + +BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + xen_evtchn_do_upcall) + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) + +BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + hyperv_vector_handler) + +#endif /* CONFIG_HYPERV */ + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax + movl 0x4(%ebp), %edx + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4,%esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +ftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + pushf /* push flags before compare (in cs location) */ + + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, ®s->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. We move the return ip into the + * ip location, and move flags into the return ip location. + */ + pushl 4(%esp) /* save return ip into ip slot */ + + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 13*4(%esp), %eax /* Get the saved flags */ + movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ + /* clobbering return ip */ + movl $__KERNEL_CS,13*4(%esp) + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + movl 14*4(%esp), %eax /* Move flags back into cs */ + movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ + movl 12*4(%esp), %eax /* Get return ip from regs->ip */ + movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + addl $8, %esp /* Skip orig_ax and ip */ + popf /* Pop flags at end (no addl to corrupt flags) */ + jmp ftrace_ret + + popf + jmp ftrace_stub +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ + + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif + +#ifdef CONFIG_TRACING +ENTRY(trace_page_fault) + ASM_CLAC + pushl $trace_do_page_fault + jmp error_code +END(trace_page_fault) +#endif + +ENTRY(page_fault) + ASM_CLAC + pushl $do_page_fault + ALIGN +error_code: + /* the function address is in %gs's slot on the stack */ + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + cld + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception +END(page_fault) + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + pushfl + pushl $__KERNEL_CS + pushl $sysenter_past_esp +.endm + +ENTRY(debug) + ASM_CLAC + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn +debug_stack_correct: + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32 + pushl %eax + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + je nmi_espfix_stack +#endif + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl %eax + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace + +nmi_stack_fixup: + FIX_STACK 12, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_debug_stack_check: + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK 24, nmi_stack_correct, 1 + jmp nmi_stack_correct + +#ifdef CONFIG_X86_ESPFIX32 +nmi_espfix_stack: + /* + * create the pointer to lss back + */ + pushl %ss + pushl %esp + addl $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + .endr + pushl %eax + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + jmp irq_return +#endif +END(nmi) + +ENTRY(int3) + ASM_CLAC + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception +END(int3) + +ENTRY(general_protection) + pushl $do_general_protection + jmp error_code +END(general_protection) + +#ifdef CONFIG_KVM_GUEST +ENTRY(async_page_fault) + ASM_CLAC + pushl $do_async_page_fault + jmp error_code +END(async_page_fault) +#endif + diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S new file mode 100644 index 000000000000..4ad79e946f5a --- /dev/null +++ b/arch/x86/entry/entry_64.S @@ -0,0 +1,1442 @@ +/* + * linux/arch/x86_64/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * Copyright (C) 2000 Pavel Machek + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * + * Some of this is documented in Documentation/x86/entry_64.txt + * + * NOTE: This code handles signal-recognition, which happens every time + * after an interrupt and after each system call. + * + * A note on terminology: + * - iret frame: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. + * + * Some macro usage: + * - ENTRY/END Define functions in the symbol table. + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. + * - idtentry - Define exception entry points. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 + + .code64 + .section .entry.text, "ax" + + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret64) + swapgs + sysretq +ENDPROC(native_usergs_sysret64) +#endif /* CONFIG_PARAVIRT */ + + +.macro TRACE_IRQS_IRETQ +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + +/* + * When dynamic function tracer is enabled it will add a breakpoint + * to all locations that it is about to modify, sync CPUs, update + * all the code, sync CPUs, then remove the breakpoints. In this time + * if lockdep is enabled, it might jump back into the debug handler + * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). + * + * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to + * make sure the stack pointer does not get reset back to the top + * of the debug stack, and instead just reuses the current stack. + */ +#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) + +.macro TRACE_IRQS_OFF_DEBUG + call debug_stack_set_zero + TRACE_IRQS_OFF + call debug_stack_reset +.endm + +.macro TRACE_IRQS_ON_DEBUG + call debug_stack_set_zero + TRACE_IRQS_ON + call debug_stack_reset +.endm + +.macro TRACE_IRQS_IRETQ_DEBUG + bt $9,EFLAGS(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON_DEBUG +1: +.endm + +#else +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +#endif + +/* + * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. + * + * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Registers on entry: + * rax system call number + * rcx return address + * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) + * rdi arg0 + * rsi arg1 + * rdx arg2 + * r10 arg3 (needs to be moved to rcx to conform to C ABI) + * r8 arg4 + * r9 arg5 + * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) + * + * Only called from user space. + * + * When user can change pt_regs->foo always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. + */ + +ENTRY(system_call) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + /* + * A hypervisor implementation might want to use a label + * after the swapgs, so that it can do the swapgs + * for the guest and jump here on syscall. + */ +GLOBAL(system_call_after_swapgs) + + movq %rsp,PER_CPU_VAR(rsp_scratch) + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp + + /* Construct struct pt_regs on stack */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + /* + * Re-enable interrupts. + * We use 'rsp_scratch' as a scratch space, hence irq-off block above + * must execute atomically in the face of possible interrupt-driven + * task preemption. We must enable interrupts only after we're done + * with using rsp_scratch: + */ + ENABLE_INTERRUPTS(CLBR_NONE) + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ + + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz tracesys +system_call_fastpath: +#if __SYSCALL_MASK == ~0 + cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10,%rcx + call *sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: +/* + * Syscall return path ending with SYSRET (fast path). + * Has incompletely filled pt_regs. + */ + LOCKDEP_SYS_EXIT + /* + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + DISABLE_INTERRUPTS(CLBR_NONE) + + /* + * We must check ti flags with interrupts (or at least preemption) + * off because we must *never* return to userspace without + * processing exit work that is enqueued if we're preempted here. + * In particular, returning to userspace with any of the one-shot + * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is + * very bad. + */ + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RIP(%rsp),%rcx + movq EFLAGS(%rsp),%r11 + movq RSP(%rsp),%rsp + /* + * 64bit SYSRET restores rip from rcx, + * rflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * Restoration of rflags re-enables interrupts. + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we should + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. (Actually + * detecting the failure in 64-bit userspace is tricky but can be + * done.) + */ + USERGS_SYSRET64 + + /* Do syscall entry tracing */ +tracesys: + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + call syscall_trace_enter_phase1 + test %rax, %rax + jnz tracesys_phase2 /* if needed, run the slow path */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX(%rsp), %rax + jmp system_call_fastpath /* and return to the fast path */ + +tracesys_phase2: + SAVE_EXTRA_REGS + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + movq %rax,%rdx + call syscall_trace_enter_phase2 + + /* + * Reload registers from stack in case ptrace changed them. + * We don't reload %rax because syscall_trace_entry_phase2() returned + * the value it wants us to use in the table lookup. + */ + RESTORE_C_REGS_EXCEPT_RAX + RESTORE_EXTRA_REGS +#if __SYSCALL_MASK == ~0 + cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10,%rcx /* fixup for C */ + call *sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: + /* Use IRET because user could have changed pt_regs->foo */ + +/* + * Syscall return path ending with IRET. + * Has correct iret frame. + */ +GLOBAL(int_ret_from_sys_call) + DISABLE_INTERRUPTS(CLBR_NONE) +int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ + TRACE_IRQS_OFF + movl $_TIF_ALLWORK_MASK,%edi + /* edi: mask to check */ +GLOBAL(int_with_check) + LOCKDEP_SYS_EXIT_IRQ + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%edx + andl %edi,%edx + jnz int_careful + andl $~TS_COMPAT,TI_status(%rcx) + jmp syscall_return + + /* Either reschedule or signal or syscall exit tracking needed. */ + /* First do a reschedule test. */ + /* edx: work, edi: workmask */ +int_careful: + bt $TIF_NEED_RESCHED,%edx + jnc int_very_careful + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq %rdi + SCHEDULE_USER + popq %rdi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + + /* handle signals and tracing -- both require a full pt_regs */ +int_very_careful: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + SAVE_EXTRA_REGS + /* Check for syscall exit trace */ + testl $_TIF_WORK_SYSCALL_EXIT,%edx + jz int_signal + pushq %rdi + leaq 8(%rsp),%rdi # &ptregs -> arg1 + call syscall_trace_leave + popq %rdi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi + jmp int_restore_rest + +int_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz 1f + movq %rsp,%rdi # &ptregs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call do_notify_resume +1: movl $_TIF_WORK_MASK,%edi +int_restore_rest: + RESTORE_EXTRA_REGS + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + +syscall_return: + /* The IRETQ could re-enable interrupts: */ + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ + + /* + * Try to use SYSRET instead of IRET if we're returning to + * a completely clean 64-bit userspace context. + */ + movq RCX(%rsp),%rcx + movq RIP(%rsp),%r11 + cmpq %rcx,%r11 /* RCX == RIP */ + jne opportunistic_sysret_failed + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. + * + * If width of "canonical tail" ever becomes variable, this will need + * to be updated to remain correct on both old and new CPUs. + */ + .ifne __VIRTUAL_MASK_SHIFT - 47 + .error "virtual address width changed -- SYSRET checks need update" + .endif + /* Change top 16 bits to be the sign-extension of 47th bit */ + shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 + jne opportunistic_sysret_failed + + cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed + + movq R11(%rsp),%r11 + cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed + + /* + * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. This would cause an infinite loop whenever #DB happens + * with register state that satisfies the opportunistic SYSRET + * conditions. For example, single-stepping this user code: + * + * movq $stuck_here,%rcx + * pushfq + * popq %r11 + * stuck_here: + * + * would never get past 'stuck_here'. + */ + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed + + /* nothing to check for RSP */ + + cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed + + /* + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. + */ +syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RSP(%rsp),%rsp + USERGS_SYSRET64 + +opportunistic_sysret_failed: + SWAPGS + jmp restore_c_regs_and_iret +END(system_call) + + + .macro FORK_LIKE func +ENTRY(stub_\func) + SAVE_EXTRA_REGS 8 + jmp sys_\func +END(stub_\func) + .endm + + FORK_LIKE clone + FORK_LIKE fork + FORK_LIKE vfork + +ENTRY(stub_execve) + call sys_execve +return_from_execve: + testl %eax, %eax + jz 1f + /* exec failed, can use fast SYSRET code path in this case */ + ret +1: + /* must use IRET code path (pt_regs->cs may have changed) */ + addq $8, %rsp + ZERO_EXTRA_REGS + movq %rax,RAX(%rsp) + jmp int_ret_from_sys_call +END(stub_execve) +/* + * Remaining execve stubs are only 7 bytes long. + * ENTRY() often aligns to 16 bytes, which in this case has no benefits. + */ + .align 8 +GLOBAL(stub_execveat) + call sys_execveat + jmp return_from_execve +END(stub_execveat) + +#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) + .align 8 +GLOBAL(stub_x32_execve) +GLOBAL(stub32_execve) + call compat_sys_execve + jmp return_from_execve +END(stub32_execve) +END(stub_x32_execve) + .align 8 +GLOBAL(stub_x32_execveat) +GLOBAL(stub32_execveat) + call compat_sys_execveat + jmp return_from_execve +END(stub32_execveat) +END(stub_x32_execveat) +#endif + +/* + * sigreturn is special because it needs to restore all registers on return. + * This cannot be done with SYSRET, so use the IRET return path instead. + */ +ENTRY(stub_rt_sigreturn) + /* + * SAVE_EXTRA_REGS result is not normally needed: + * sigreturn overwrites all pt_regs->GPREGS. + * But sigreturn can fail (!), and there is no easy way to detect that. + * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, + * we SAVE_EXTRA_REGS here. + */ + SAVE_EXTRA_REGS 8 + call sys_rt_sigreturn +return_from_stub: + addq $8, %rsp + RESTORE_EXTRA_REGS + movq %rax,RAX(%rsp) + jmp int_ret_from_sys_call +END(stub_rt_sigreturn) + +#ifdef CONFIG_X86_X32_ABI +ENTRY(stub_x32_rt_sigreturn) + SAVE_EXTRA_REGS 8 + call sys32_x32_rt_sigreturn + jmp return_from_stub +END(stub_x32_rt_sigreturn) +#endif + +/* + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ +ENTRY(ret_from_fork) + + LOCK ; btr $TIF_FORK,TI_flags(%r8) + + pushq $0x0002 + popfq # reset kernel eflags + + call schedule_tail # rdi: 'prev' task parameter + + RESTORE_EXTRA_REGS + + testb $3, CS(%rsp) # from kernel_thread? + + /* + * By the time we get here, we have no idea whether our pt_regs, + * ti flags, and ti status came from the 64-bit SYSCALL fast path, + * the slow path, or one of the ia32entry paths. + * Use IRET code path to return, since it can safely handle + * all of the above. + */ + jnz int_ret_from_sys_call + + /* We came from kernel_thread */ + /* nb: we depend on RESTORE_EXTRA_REGS above */ + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) + RESTORE_EXTRA_REGS + jmp int_ret_from_sys_call +END(ret_from_fork) + +/* + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. + */ + .align 8 +ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushq $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt + .align 8 + .endr +END(irq_entries_start) + +/* + * Interrupt entry/exit. + * + * Interrupt entry points save only callee clobbered registers in fast path. + * + * Entry runs with interrupts off. + */ + +/* 0(%rsp): ~(interrupt number) */ + .macro interrupt func + cld + /* + * Since nothing in interrupt handling code touches r12...r15 members + * of "struct pt_regs", and since interrupts can nest, we can save + * four stack slots and simultaneously provide + * an unwind-friendly stack layout by saving "truncated" pt_regs + * exactly up to rbp slot, without these members. + */ + ALLOC_PT_GPREGS_ON_STACK -RBP + SAVE_C_REGS -RBP + /* this goes to 0(%rsp) for unwinder, not for saving the value: */ + SAVE_EXTRA_REGS_RBP -RBP + + leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ + + testb $3, CS-RBP(%rsp) + jz 1f + SWAPGS +1: + /* + * Save previous stack pointer, optionally switch to interrupt stack. + * irq_count is used to check if a CPU is already on an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ + movq %rsp, %rsi + incl PER_CPU_VAR(irq_count) + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp + pushq %rsi + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF + + call \func + .endm + + /* + * The interrupt stubs push (~vector+0x80) onto the stack and + * then jump to common_interrupt. + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + ASM_CLAC + addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + interrupt do_IRQ + /* 0(%rsp): old RSP */ +ret_from_intr: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + decl PER_CPU_VAR(irq_count) + + /* Restore saved previous stack */ + popq %rsi + /* return code expects complete pt_regs - adjust rsp accordingly: */ + leaq -RBP(%rsi),%rsp + + testb $3, CS(%rsp) + jz retint_kernel + /* Interrupt came from user space */ +retint_user: + GET_THREAD_INFO(%rcx) + /* + * %rcx: thread info. Interrupts off. + */ +retint_with_reschedule: + movl $_TIF_WORK_MASK,%edi +retint_check: + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + andl %edi,%edx + jnz retint_careful + +retint_swapgs: /* return to user-space */ + /* + * The iretq could re-enable interrupts: + */ + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ + + SWAPGS + jmp restore_c_regs_and_iret + +/* Returning to kernel space */ +retint_kernel: +#ifdef CONFIG_PREEMPT + /* Interrupts are off */ + /* Check if we need preemption */ + bt $9,EFLAGS(%rsp) /* interrupts were off? */ + jnc 1f +0: cmpl $0,PER_CPU_VAR(__preempt_count) + jnz 1f + call preempt_schedule_irq + jmp 0b +1: +#endif + /* + * The iretq could re-enable interrupts: + */ + TRACE_IRQS_IRETQ + +/* + * At this label, code paths which return to kernel and to user, + * which come from interrupts/exception and from syscalls, merge. + */ +restore_c_regs_and_iret: + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 + +irq_return: + INTERRUPT_RETURN + +ENTRY(native_iret) + /* + * Are we returning to a stack segment from the LDT? Note: in + * 64-bit mode SS:RSP on the exception stack is always valid. + */ +#ifdef CONFIG_X86_ESPFIX64 + testb $4,(SS-RIP)(%rsp) + jnz native_irq_return_ldt +#endif + +.global native_irq_return_iret +native_irq_return_iret: + /* + * This may fault. Non-paranoid faults on return to userspace are + * handled by fixup_bad_iret. These include #SS, #GP, and #NP. + * Double-faults due to espfix64 are handled in do_double_fault. + * Other faults here are fatal. + */ + iretq + +#ifdef CONFIG_X86_ESPFIX64 +native_irq_return_ldt: + pushq %rax + pushq %rdi + SWAPGS + movq PER_CPU_VAR(espfix_waddr),%rdi + movq %rax,(0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp),%rax /* RIP */ + movq %rax,(1*8)(%rdi) + movq (3*8)(%rsp),%rax /* CS */ + movq %rax,(2*8)(%rdi) + movq (4*8)(%rsp),%rax /* RFLAGS */ + movq %rax,(3*8)(%rdi) + movq (6*8)(%rsp),%rax /* SS */ + movq %rax,(5*8)(%rdi) + movq (5*8)(%rsp),%rax /* RSP */ + movq %rax,(4*8)(%rdi) + andl $0xffff0000,%eax + popq %rdi + orq PER_CPU_VAR(espfix_stack),%rax + SWAPGS + movq %rax,%rsp + popq %rax + jmp native_irq_return_iret +#endif + + /* edi: workmask, edx: work */ +retint_careful: + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq %rdi + SCHEDULE_USER + popq %rdi + GET_THREAD_INFO(%rcx) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp retint_check + +retint_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz retint_swapgs + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + SAVE_EXTRA_REGS + movq $-1,ORIG_RAX(%rsp) + xorl %esi,%esi # oldset + movq %rsp,%rdi # &pt_regs + call do_notify_resume + RESTORE_EXTRA_REGS + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_with_reschedule + +END(common_interrupt) + +/* + * APIC interrupts. + */ +.macro apicinterrupt3 num sym do_sym +ENTRY(\sym) + ASM_CLAC + pushq $~(\num) +.Lcommon_\sym: + interrupt \do_sym + jmp ret_from_intr +END(\sym) +.endm + +#ifdef CONFIG_TRACING +#define trace(sym) trace_##sym +#define smp_trace(sym) smp_trace_##sym + +.macro trace_apicinterrupt num sym +apicinterrupt3 \num trace(\sym) smp_trace(\sym) +.endm +#else +.macro trace_apicinterrupt num sym do_sym +.endm +#endif + +.macro apicinterrupt num sym do_sym +apicinterrupt3 \num \sym \do_sym +trace_apicinterrupt \num \sym +.endm + +#ifdef CONFIG_SMP +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt3 REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt +#endif + +#ifdef CONFIG_X86_UV +apicinterrupt3 UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt +#endif +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi + +#ifdef CONFIG_HAVE_KVM +apicinterrupt3 POSTED_INTR_VECTOR \ + kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt smp_threshold_interrupt +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt +#endif + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt +#endif + +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt + +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt +#endif + +/* + * Exception entry points. + */ +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) + +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 +ENTRY(\sym) + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 + .error "using shift_ist requires paranoid=1" + .endif + + ASM_CLAC + PARAVIRT_ADJUST_EXCEPTION_FRAME + + .ifeq \has_error_code + pushq $-1 /* ORIG_RAX: no syscall to restart */ + .endif + + ALLOC_PT_GPREGS_ON_STACK + + .if \paranoid + .if \paranoid == 1 + testb $3, CS(%rsp) /* If coming from userspace, switch */ + jnz 1f /* stacks. */ + .endif + call paranoid_entry + .else + call error_entry + .endif + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + .endif + + call \do_sym + + .if \shift_ist != -1 + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + .endif + + /* these procedures expect "no swapgs" flag in ebx */ + .if \paranoid + jmp paranoid_exit + .else + jmp error_exit + .endif + + .if \paranoid == 1 + /* + * Paranoid entry from userspace. Switch stacks and treat it + * as a normal entry. This means that paranoid handlers + * run in real process context if user_mode(regs). + */ +1: + call error_entry + + + movq %rsp,%rdi /* pt_regs pointer */ + call sync_regs + movq %rax,%rsp /* switch stack */ + + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + call \do_sym + + jmp error_exit /* %ebx: no swapgs flag */ + .endif +END(\sym) +.endm + +#ifdef CONFIG_TRACING +.macro trace_idtentry sym do_sym has_error_code:req +idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#else +.macro trace_idtentry sym do_sym has_error_code:req +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#endif + +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 + + + /* Reload gs selector with exception handling */ + /* edi: new selector */ +ENTRY(native_load_gs_index) + pushfq + DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) + SWAPGS +gs_change: + movl %edi,%gs +2: mfence /* workaround */ + SWAPGS + popfq + ret +END(native_load_gs_index) + + _ASM_EXTABLE(gs_change,bad_gs) + .section .fixup,"ax" + /* running with kernelgs */ +bad_gs: + SWAPGS /* switch back to user gs */ + xorl %eax,%eax + movl %eax,%gs + jmp 2b + .previous + +/* Call softirq on interrupt stack. Interrupts are off. */ +ENTRY(do_softirq_own_stack) + pushq %rbp + mov %rsp,%rbp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr),%rsp + push %rbp # backlink for old unwinder + call __do_softirq + leaveq + decl PER_CPU_VAR(irq_count) + ret +END(do_softirq_own_stack) + +#ifdef CONFIG_XEN +idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 + +/* + * A note on the "critical region" in our callback handler. + * We want to avoid stacking callback handlers due to events occurring + * during handling of the last event. To do this, we keep events disabled + * until we've done all processing. HOWEVER, we must enable events before + * popping the stack frame (can't be done atomically) and so it would still + * be possible to get enough handler activations to overflow the stack. + * Although unlikely, bugs of that kind are hard to track down, so we'd + * like to avoid the possibility. + * So, on entry to the handler we detect whether we interrupted an + * existing activation in its critical region -- if so, we pop the current + * activation and restart the handler using the previous one. + */ +ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) +/* + * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + * see the correct pointer to the pt_regs + */ + movq %rdi, %rsp # we don't return, adjust the stack frame +11: incl PER_CPU_VAR(irq_count) + movq %rsp,%rbp + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp + pushq %rbp # backlink for old unwinder + call xen_evtchn_do_upcall + popq %rsp + decl PER_CPU_VAR(irq_count) +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif + jmp error_exit +END(xen_do_hypervisor_callback) + +/* + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we do not need to fix up as Xen has already reloaded all segment + * registers that could be reloaded and zeroed the others. + * Category 2 we fix up by killing the current process. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by comparing each saved segment register + * with its current contents: any discrepancy means we in category 1. + */ +ENTRY(xen_failsafe_callback) + movl %ds,%ecx + cmpw %cx,0x10(%rsp) + jne 1f + movl %es,%ecx + cmpw %cx,0x18(%rsp) + jne 1f + movl %fs,%ecx + cmpw %cx,0x20(%rsp) + jne 1f + movl %gs,%ecx + cmpw %cx,0x28(%rsp) + jne 1f + /* All segments match their saved values => Category 2 (Bad IRET). */ + movq (%rsp),%rcx + movq 8(%rsp),%r11 + addq $0x30,%rsp + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx + jmp general_protection +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ + movq (%rsp),%rcx + movq 8(%rsp),%r11 + addq $0x30,%rsp + pushq $-1 /* orig_ax = -1 => not a system call */ + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS + jmp error_exit +END(xen_failsafe_callback) + +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + xen_hvm_callback_vector xen_evtchn_do_upcall + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + hyperv_callback_vector hyperv_vector_handler +#endif /* CONFIG_HYPERV */ + +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 +#ifdef CONFIG_XEN +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 +#endif +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 +#ifdef CONFIG_KVM_GUEST +idtentry async_page_fault do_async_page_fault has_error_code=1 +#endif +#ifdef CONFIG_X86_MCE +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +#endif + +/* + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + */ +ENTRY(paranoid_entry) + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret +END(paranoid_entry) + +/* + * "Paranoid" exit path from exception stack. This is invoked + * only on return from non-NMI IST interrupts that came + * from kernel space. + * + * We may be returning to very strange contexts (e.g. very early + * in syscall entry), so checking for preemption here would + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. + */ +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ +ENTRY(paranoid_exit) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF_DEBUG + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs + TRACE_IRQS_IRETQ + SWAPGS_UNSAFE_STACK + jmp paranoid_exit_restore +paranoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG +paranoid_exit_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 + INTERRUPT_RETURN +END(paranoid_exit) + +/* + * Save all registers in pt_regs, and switch gs if needed. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + */ +ENTRY(error_entry) + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 + xorl %ebx,%ebx + testb $3, CS+8(%rsp) + jz error_kernelspace +error_swapgs: + SWAPGS +error_sti: + TRACE_IRQS_OFF + ret + + /* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. + */ +error_kernelspace: + incl %ebx + leaq native_irq_return_iret(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_bad_iret + movl %ecx,%eax /* zero extend */ + cmpq %rax,RIP+8(%rsp) + je bstep_iret + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti + +bstep_iret: + /* Fix truncated RIP */ + movq %rcx,RIP+8(%rsp) + /* fall through */ + +error_bad_iret: + SWAPGS + mov %rsp,%rdi + call fixup_bad_iret + mov %rax,%rsp + decl %ebx /* Return to usergs */ + jmp error_sti +END(error_entry) + + +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ +ENTRY(error_exit) + movl %ebx,%eax + RESTORE_EXTRA_REGS + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %eax,%eax + jnz retint_kernel + jmp retint_user +END(error_exit) + +/* Runs on exception stack */ +ENTRY(nmi) + PARAVIRT_ADJUST_EXCEPTION_FRAME + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. + * This means that we can have nested NMIs where the next + * NMI is using the top of the stack of the previous NMI. We + * can't let it execute because the nested NMI will corrupt the + * stack of the previous NMI. NMI handlers are not re-entrant + * anyway. + * + * To handle this case we do the following: + * Check the a special location on the stack that contains + * a variable that is set when NMIs are executing. + * The interrupted task's stack is also checked to see if it + * is an NMI stack. + * If the variable is not set and the stack is not the NMI + * stack then: + * o Set the special variable on the stack + * o Copy the interrupt frame into a "saved" location on the stack + * o Copy the interrupt frame into a "copy" location on the stack + * o Continue processing the NMI + * If the variable is set or the previous stack is the NMI stack: + * o Modify the "copy" location to jump to the repeate_nmi + * o return back to the first NMI + * + * Now on exit of the first NMI, we first clear the stack variable + * The NMI stack will tell any nested NMIs at that point that it is + * nested. Then we pop the stack normally with iret, and if there was + * a nested NMI that updated the copy interrupt stack frame, a + * jump will be made to the repeat_nmi code that will handle the second + * NMI. + */ + + /* Use %rdx as our temp variable throughout */ + pushq %rdx + + /* + * If %cs was not the kernel segment, then the NMI triggered in user + * space, which means it is definitely not nested. + */ + cmpl $__KERNEL_CS, 16(%rsp) + jne first_nmi + + /* + * Check the special variable on the stack to see if NMIs are + * executing. + */ + cmpl $1, -8(%rsp) + je nested_nmi + + /* + * Now test if the previous stack was an NMI stack. + * We need the double check. We check the NMI stack to satisfy the + * race when the first NMI clears the variable before returning. + * We check the variable because the first NMI could be in a + * breakpoint routine using a breakpoint stack. + */ + lea 6*8(%rsp), %rdx + /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ + cmpq %rdx, 4*8(%rsp) + /* If the stack pointer is above the NMI stack, this is a normal NMI */ + ja first_nmi + subq $EXCEPTION_STKSZ, %rdx + cmpq %rdx, 4*8(%rsp) + /* If it is below the NMI stack, it is a normal NMI */ + jb first_nmi + /* Ah, it is within the NMI stack, treat it as nested */ + +nested_nmi: + /* + * Do nothing if we interrupted the fixup in repeat_nmi. + * It's about to repeat the NMI handler, so we are fine + * with ignoring this one. + */ + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out + +1: + /* Set up the interrupted NMIs stack to jump to repeat_nmi */ + leaq -1*8(%rsp), %rdx + movq %rdx, %rsp + leaq -10*8(%rsp), %rdx + pushq $__KERNEL_DS + pushq %rdx + pushfq + pushq $__KERNEL_CS + pushq $repeat_nmi + + /* Put stack back */ + addq $(6*8), %rsp + +nested_nmi_out: + popq %rdx + + /* No need to check faults here */ + INTERRUPT_RETURN + +first_nmi: + /* + * Because nested NMIs will use the pushed location that we + * stored in rdx, we must keep that space available. + * Here's what our stack frame will look like: + * +-------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +-------------------------+ + * | temp storage for rdx | + * +-------------------------+ + * | NMI executing variable | + * +-------------------------+ + * | copied SS | + * | copied Return RSP | + * | copied RFLAGS | + * | copied CS | + * | copied RIP | + * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ + * | pt_regs | + * +-------------------------+ + * + * The saved stack frame is used to fix up the copied stack frame + * that a nested NMI may change to make the interrupted NMI iret jump + * to the repeat_nmi. The original stack frame and the temp storage + * is also used by nested NMIs and can not be trusted on exit. + */ + /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ + movq (%rsp), %rdx + + /* Set the NMI executing variable on the stack. */ + pushq $1 + + /* + * Leave room for the "copied" frame + */ + subq $(5*8), %rsp + + /* Copy the stack frame to the Saved frame */ + .rept 5 + pushq 11*8(%rsp) + .endr + + /* Everything up to here is safe from nested NMIs */ + + /* + * If there was a nested NMI, the first NMI's iret will return + * here. But NMIs are still enabled and we can take another + * nested NMI. The nested NMI checks the interrupted RIP to see + * if it is between repeat_nmi and end_repeat_nmi, and if so + * it will just return, as we are about to repeat an NMI anyway. + * This makes it safe to copy to the stack frame that a nested + * NMI will update. + */ +repeat_nmi: + /* + * Update the stack variable to say we are still in NMI (the update + * is benign for the non-repeat case, where 1 was pushed just above + * to this very stack slot). + */ + movq $1, 10*8(%rsp) + + /* Make another copy, this one may be modified by nested NMIs */ + addq $(10*8), %rsp + .rept 5 + pushq -6*8(%rsp) + .endr + subq $(5*8), %rsp +end_repeat_nmi: + + /* + * Everything below this point can be preempted by a nested + * NMI if the first NMI took an exception and reset our iret stack + * so that we repeat another NMI. + */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ + ALLOC_PT_GPREGS_ON_STACK + + /* + * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit + * as we should not be calling schedule in NMI context. + * Even with normal interrupts enabled. An NMI should not be + * setting NEED_RESCHED or anything that normal interrupts and + * exceptions might do. + */ + call paranoid_entry + + /* + * Save off the CR2 register. If we take a page fault in the NMI then + * it could corrupt the CR2 value. If the NMI preempts a page fault + * handler before it was able to read the CR2 register, and then the + * NMI itself takes a page fault, the page fault that was preempted + * will read the information from the NMI page fault and not the + * origin fault. Save it off and restore it if it changes. + * Use the r12 callee-saved register. + */ + movq %cr2, %r12 + + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp,%rdi + movq $-1,%rsi + call do_nmi + + /* Did the NMI take a page fault? Restore cr2 if it did */ + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 +1: + testl %ebx,%ebx /* swapgs needed? */ + jnz nmi_restore +nmi_swapgs: + SWAPGS_UNSAFE_STACK +nmi_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS + /* Pop the extra iret frame at once */ + REMOVE_PT_GPREGS_FROM_STACK 6*8 + + /* Clear the NMI executing stack variable */ + movq $0, 5*8(%rsp) + jmp irq_return +END(nmi) + +ENTRY(ignore_sysret) + mov $-ENOSYS,%eax + sysret +END(ignore_sysret) + diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9bcd0b56ca17..9d3ee054453d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n CFLAGS_irq.o := -I$(src)/../include/asm/trace -obj-y := process_$(BITS).o signal.o entry_$(BITS).o +obj-y := process_$(BITS).o signal.o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S deleted file mode 100644 index 0ac73de925d1..000000000000 --- a/arch/x86/kernel/entry_32.S +++ /dev/null @@ -1,1249 +0,0 @@ -/* - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * This also contains the timer-interrupt handler, as well as all interrupts - * and faults that can result in a task-switch. - * - * NOTE: This code handles signal-recognition, which happens every time - * after a timer-interrupt and after each system call. - * - * I changed all the .align's to 4 (16 byte alignment), as that's faster - * on a 486. - * - * Stack layout in 'syscall_exit': - * ptrace needs to have all regs on the stack. - * if the order here is changed, it needs to be - * updated in fork.c:copy_process, signal.c:do_signal, - * ptrace.c and ptrace.h - * - * 0(%esp) - %ebx - * 4(%esp) - %ecx - * 8(%esp) - %edx - * C(%esp) - %esi - * 10(%esp) - %edi - * 14(%esp) - %ebp - * 18(%esp) - %eax - * 1C(%esp) - %ds - * 20(%esp) - %es - * 24(%esp) - %fs - * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS - * 2C(%esp) - orig_eax - * 30(%esp) - %eip - * 34(%esp) - %cs - * 38(%esp) - %eflags - * 3C(%esp) - %oldesp - * 40(%esp) - %oldss - * - * "current" is in register %ebx during any slow entries. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -#define sysenter_audit syscall_trace_entry -#define sysexit_audit syscall_exit_work -#endif - - .section .entry.text, "ax" - -/* - * We use macros for low-level operations which need to be overridden - * for paravirtualization. The following will never clobber any registers: - * INTERRUPT_RETURN (aka. "iret") - * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). - * - * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must - * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). - * Allowing a register to be clobbered can shrink the paravirt replacement - * enough to patch inline, increasing performance. - */ - -#ifdef CONFIG_PREEMPT -#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF -#else -#define preempt_stop(clobbers) -#define resume_kernel restore_all -#endif - -.macro TRACE_IRQS_IRET -#ifdef CONFIG_TRACE_IRQFLAGS - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? - jz 1f - TRACE_IRQS_ON -1: -#endif -.endm - -/* - * User gs save/restore - * - * %gs is used for userland TLS and kernel only uses it for stack - * canary which is required to be at %gs:20 by gcc. Read the comment - * at the top of stackprotector.h for more info. - * - * Local labels 98 and 99 are used. - */ -#ifdef CONFIG_X86_32_LAZY_GS - - /* unfortunately push/pop can't be no-op */ -.macro PUSH_GS - pushl $0 -.endm -.macro POP_GS pop=0 - addl $(4 + \pop), %esp -.endm -.macro POP_GS_EX -.endm - - /* all the rest are no-op */ -.macro PTGS_TO_GS -.endm -.macro PTGS_TO_GS_EX -.endm -.macro GS_TO_REG reg -.endm -.macro REG_TO_PTGS reg -.endm -.macro SET_KERNEL_GS reg -.endm - -#else /* CONFIG_X86_32_LAZY_GS */ - -.macro PUSH_GS - pushl %gs -.endm - -.macro POP_GS pop=0 -98: popl %gs - .if \pop <> 0 - add $\pop, %esp - .endif -.endm -.macro POP_GS_EX -.pushsection .fixup, "ax" -99: movl $0, (%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro PTGS_TO_GS -98: mov PT_GS(%esp), %gs -.endm -.macro PTGS_TO_GS_EX -.pushsection .fixup, "ax" -99: movl $0, PT_GS(%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro GS_TO_REG reg - movl %gs, \reg -.endm -.macro REG_TO_PTGS reg - movl \reg, PT_GS(%esp) -.endm -.macro SET_KERNEL_GS reg - movl $(__KERNEL_STACK_CANARY), \reg - movl \reg, %gs -.endm - -#endif /* CONFIG_X86_32_LAZY_GS */ - -.macro SAVE_ALL - cld - PUSH_GS - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - movl $(__USER_DS), %edx - movl %edx, %ds - movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs - SET_KERNEL_GS %edx -.endm - -.macro RESTORE_INT_REGS - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax -.endm - -.macro RESTORE_REGS pop=0 - RESTORE_INT_REGS -1: popl %ds -2: popl %es -3: popl %fs - POP_GS \pop -.pushsection .fixup, "ax" -4: movl $0, (%esp) - jmp 1b -5: movl $0, (%esp) - jmp 2b -6: movl $0, (%esp) - jmp 3b -.popsection - _ASM_EXTABLE(1b,4b) - _ASM_EXTABLE(2b,5b) - _ASM_EXTABLE(3b,6b) - POP_GS_EX -.endm - -ENTRY(ret_from_fork) - pushl %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl %eax - pushl $0x0202 # Reset kernel eflags - popfl - jmp syscall_exit -END(ret_from_fork) - -ENTRY(ret_from_kernel_thread) - pushl %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl %eax - pushl $0x0202 # Reset kernel eflags - popfl - movl PT_EBP(%esp),%eax - call *PT_EBX(%esp) - movl $0,PT_EAX(%esp) - jmp syscall_exit -ENDPROC(ret_from_kernel_thread) - -/* - * Return to user mode is not as complex as all this looks, - * but we want the default path for a system call return to - * go as quickly as possible which is why some of this is - * less clear than it otherwise should be. - */ - - # userspace resumption stub bypassing syscall exit tracing - ALIGN -ret_from_exception: - preempt_stop(CLBR_ANY) -ret_from_intr: - GET_THREAD_INFO(%ebp) -#ifdef CONFIG_VM86 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax -#else - /* - * We can be coming here from child spawned by kernel_thread(). - */ - movl PT_CS(%esp), %eax - andl $SEGMENT_RPL_MASK, %eax -#endif - cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace - -ENTRY(resume_userspace) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on - # int/exception return? - jne work_pending - jmp restore_all -END(ret_from_exception) - -#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) -need_resched: - cmpl $0,PER_CPU_VAR(__preempt_count) - jnz restore_all - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all - call preempt_schedule_irq - jmp need_resched -END(resume_kernel) -#endif - -/* SYSENTER_RETURN points to after the "sysenter" instruction in - the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ - - # sysenter call handler stub -ENTRY(ia32_sysenter_target) - movl TSS_sysenter_sp0(%esp),%esp -sysenter_past_esp: - /* - * Interrupts are disabled here, but we can't trace it until - * enough kernel state to call TRACE_IRQS_OFF can be called - but - * we immediately enable interrupts at that point anyway. - */ - pushl $__USER_DS - pushl %ebp - pushfl - orl $X86_EFLAGS_IF, (%esp) - pushl $__USER_CS - /* - * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary: TI_sysenter_return - * is relative to thread_info, which is at the bottom of the - * kernel stack page. 4*4 means the 4 words pushed above; - * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; - * and THREAD_SIZE takes us to the bottom. - */ - pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - - pushl %eax - SAVE_ALL - ENABLE_INTERRUPTS(CLBR_NONE) - -/* - * Load the potential sixth argument from user stack. - * Careful about security. - */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault - ASM_STAC -1: movl (%ebp),%ebp - ASM_CLAC - movl %ebp,PT_EBP(%esp) - _ASM_EXTABLE(1b,syscall_fault) - - GET_THREAD_INFO(%ebp) - - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz sysenter_audit -sysenter_do_call: - cmpl $(NR_syscalls), %eax - jae sysenter_badsys - call *sys_call_table(,%eax,4) -sysenter_after_call: - movl %eax,PT_EAX(%esp) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx - jnz sysexit_audit -sysenter_exit: -/* if something modifies registers it must also disable sysexit */ - movl PT_EIP(%esp), %edx - movl PT_OLDESP(%esp), %ecx - xorl %ebp,%ebp - TRACE_IRQS_ON -1: mov PT_FS(%esp), %fs - PTGS_TO_GS - ENABLE_INTERRUPTS_SYSEXIT - -#ifdef CONFIG_AUDITSYSCALL -sysenter_audit: - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry - /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ - movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ - /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl PT_ESI(%esp) /* a3: 5th arg */ - pushl PT_EDX+4(%esp) /* a2: 4th arg */ - call __audit_syscall_entry - popl %ecx /* get that remapped edx off the stack */ - popl %ecx /* get that remapped esi off the stack */ - movl PT_EAX(%esp),%eax /* reload syscall number */ - jmp sysenter_do_call - -sysexit_audit: - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - movl %eax,%edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - setbe %al /* 1 if so, 0 if not */ - movzbl %al,%eax /* zero-extend that */ - call __audit_syscall_exit - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - movl PT_EAX(%esp),%eax /* reload syscall return value */ - jmp sysenter_exit -#endif - -.pushsection .fixup,"ax" -2: movl $0,PT_FS(%esp) - jmp 1b -.popsection - _ASM_EXTABLE(1b,2b) - PTGS_TO_GS_EX -ENDPROC(ia32_sysenter_target) - - # system call handler stub -ENTRY(system_call) - ASM_CLAC - pushl %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(NR_syscalls), %eax - jae syscall_badsys -syscall_call: - call *sys_call_table(,%eax,4) -syscall_after_call: - movl %eax,PT_EAX(%esp) # store the return value -syscall_exit: - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx # current->work - jnz syscall_exit_work - -restore_all: - TRACE_IRQS_IRET -restore_all_notrace: -#ifdef CONFIG_X86_ESPFIX32 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: PT_OLDSS(%esp) contains the wrong/random values if we - # are returning to the kernel. - # See comments in process.c:copy_thread() for details. - movb PT_OLDSS(%esp), %ah - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax - cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je ldt_ss # returning to user-space with LDT SS -#endif -restore_nocheck: - RESTORE_REGS 4 # skip orig_eax/error_code -irq_return: - INTERRUPT_RETURN -.section .fixup,"ax" -ENTRY(iret_exc) - pushl $0 # no error code - pushl $do_iret_error - jmp error_code -.previous - _ASM_EXTABLE(irq_return,iret_exc) - -#ifdef CONFIG_X86_ESPFIX32 -ldt_ss: -#ifdef CONFIG_PARAVIRT - /* - * The kernel can't run on a non-flat stack if paravirt mode - * is active. Rather than try to fixup the high bits of - * ESP, bypass this code entirely. This may break DOSemu - * and/or Wine support in a paravirt VM, although the option - * is still available to implement the setting of the high - * 16-bits in the INTERRUPT_RETURN paravirt-op. - */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck -#endif - -/* - * Setup and switch to ESPFIX stack - * - * We're returning to userspace with a 16 bit stack. The CPU will not - * restore the high word of ESP for us on executing iret... This is an - * "official" bug of all the x86-compatible CPUs, which we can work - * around to make dosemu and wine happy. We do this by preloading the - * high word of ESP with the high word of the userspace ESP while - * compensating for the offset by changing to the ESPFIX segment with - * a base address that matches for the difference. - */ -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) - mov %esp, %edx /* load kernel esp */ - mov PT_OLDESP(%esp), %eax /* load userspace esp */ - mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ - shr $16, %edx - mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ - mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - pushl %eax /* new kernel esp */ - /* Disable interrupts, but do not irqtrace this section: we - * will soon execute iret and the tracer was already set to - * the irqstate after the iret */ - DISABLE_INTERRUPTS(CLBR_EAX) - lss (%esp), %esp /* switch to espfix segment */ - jmp restore_nocheck -#endif -ENDPROC(system_call) - - # perform work that needs to be done immediately before resumption - ALIGN -work_pending: - testb $_TIF_NEED_RESCHED, %cl - jz work_notifysig -work_resched: - call schedule - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other - # than syscall tracing? - jz restore_all - testb $_TIF_NEED_RESCHED, %cl - jnz work_resched - -work_notifysig: # deal with pending signals and - # notify-resume requests -#ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) - movl %esp, %eax - jnz work_notifysig_v86 # returning to kernel-space or - # vm86-space -1: -#else - movl %esp, %eax -#endif - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movb PT_CS(%esp), %bl - andb $SEGMENT_RPL_MASK, %bl - cmpb $USER_RPL, %bl - jb resume_kernel - xorl %edx, %edx - call do_notify_resume - jmp resume_userspace - -#ifdef CONFIG_VM86 - ALIGN -work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - call save_v86_state # %eax contains pt_regs pointer - popl %ecx - movl %eax, %esp - jmp 1b -#endif -END(work_pending) - - # perform syscall exit tracing - ALIGN -syscall_trace_entry: - movl $-ENOSYS,PT_EAX(%esp) - movl %esp, %eax - call syscall_trace_enter - /* What it returned is what we'll actually use. */ - cmpl $(NR_syscalls), %eax - jnae syscall_call - jmp syscall_exit -END(syscall_trace_entry) - - # perform syscall exit tracing - ALIGN -syscall_exit_work: - testl $_TIF_WORK_SYSCALL_EXIT, %ecx - jz work_pending - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call - # schedule() instead - movl %esp, %eax - call syscall_trace_leave - jmp resume_userspace -END(syscall_exit_work) - -syscall_fault: - ASM_CLAC - GET_THREAD_INFO(%ebp) - movl $-EFAULT,PT_EAX(%esp) - jmp resume_userspace -END(syscall_fault) - -syscall_badsys: - movl $-ENOSYS,%eax - jmp syscall_after_call -END(syscall_badsys) - -sysenter_badsys: - movl $-ENOSYS,%eax - jmp sysenter_after_call -END(sysenter_badsys) - -.macro FIXUP_ESPFIX_STACK -/* - * Switch back for ESPFIX stack to the normal zerobased stack - * - * We can't call C functions using the ESPFIX stack. This code reads - * the high word of the segment base from the GDT and swiches to the - * normal stack and adjusts ESP with the matching offset. - */ -#ifdef CONFIG_X86_ESPFIX32 - /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ - shl $16, %eax - addl %esp, %eax /* the adjusted stack pointer */ - pushl $__KERNEL_DS - pushl %eax - lss (%esp), %esp /* switch to the normal stack segment */ -#endif -.endm -.macro UNWIND_ESPFIX_STACK -#ifdef CONFIG_X86_ESPFIX32 - movl %ss, %eax - /* see if on espfix stack */ - cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es - /* switch to normal stack */ - FIXUP_ESPFIX_STACK -27: -#endif -.endm - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -ENTRY(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_interrupt - .align 8 - .endr -END(irq_entries_start) - -/* - * the CPU automatically disables interrupts when executing an IRQ vector, - * so IRQ-flags tracing has to follow that: - */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: - ASM_CLAC - addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ - SAVE_ALL - TRACE_IRQS_OFF - movl %esp,%eax - call do_IRQ - jmp ret_from_intr -ENDPROC(common_interrupt) - -#define BUILD_INTERRUPT3(name, nr, fn) \ -ENTRY(name) \ - ASM_CLAC; \ - pushl $~(nr); \ - SAVE_ALL; \ - TRACE_IRQS_OFF \ - movl %esp,%eax; \ - call fn; \ - jmp ret_from_intr; \ -ENDPROC(name) - - -#ifdef CONFIG_TRACING -#define TRACE_BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) -#else -#define TRACE_BUILD_INTERRUPT(name, nr) -#endif - -#define BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(name, nr, smp_##name); \ - TRACE_BUILD_INTERRUPT(name, nr) - -/* The include is where all of the SMP etc. interrupts come from */ -#include - -ENTRY(coprocessor_error) - ASM_CLAC - pushl $0 - pushl $do_coprocessor_error - jmp error_code -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - ASM_CLAC - pushl $0 -#ifdef CONFIG_X86_INVD_BUG - /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl $do_general_protection", \ - "pushl $do_simd_coprocessor_error", \ - X86_FEATURE_XMM -#else - pushl $do_simd_coprocessor_error -#endif - jmp error_code -END(simd_coprocessor_error) - -ENTRY(device_not_available) - ASM_CLAC - pushl $-1 # mark this as an int - pushl $do_device_not_available - jmp error_code -END(device_not_available) - -#ifdef CONFIG_PARAVIRT -ENTRY(native_iret) - iret - _ASM_EXTABLE(native_iret, iret_exc) -END(native_iret) - -ENTRY(native_irq_enable_sysexit) - sti - sysexit -END(native_irq_enable_sysexit) -#endif - -ENTRY(overflow) - ASM_CLAC - pushl $0 - pushl $do_overflow - jmp error_code -END(overflow) - -ENTRY(bounds) - ASM_CLAC - pushl $0 - pushl $do_bounds - jmp error_code -END(bounds) - -ENTRY(invalid_op) - ASM_CLAC - pushl $0 - pushl $do_invalid_op - jmp error_code -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - ASM_CLAC - pushl $0 - pushl $do_coprocessor_segment_overrun - jmp error_code -END(coprocessor_segment_overrun) - -ENTRY(invalid_TSS) - ASM_CLAC - pushl $do_invalid_TSS - jmp error_code -END(invalid_TSS) - -ENTRY(segment_not_present) - ASM_CLAC - pushl $do_segment_not_present - jmp error_code -END(segment_not_present) - -ENTRY(stack_segment) - ASM_CLAC - pushl $do_stack_segment - jmp error_code -END(stack_segment) - -ENTRY(alignment_check) - ASM_CLAC - pushl $do_alignment_check - jmp error_code -END(alignment_check) - -ENTRY(divide_error) - ASM_CLAC - pushl $0 # no error code - pushl $do_divide_error - jmp error_code -END(divide_error) - -#ifdef CONFIG_X86_MCE -ENTRY(machine_check) - ASM_CLAC - pushl $0 - pushl machine_check_vector - jmp error_code -END(machine_check) -#endif - -ENTRY(spurious_interrupt_bug) - ASM_CLAC - pushl $0 - pushl $do_spurious_interrupt_bug - jmp error_code -END(spurious_interrupt_bug) - -#ifdef CONFIG_XEN -/* Xen doesn't set %esp to be precisely what the normal sysenter - entrypoint expects, so fix it up before using the normal path. */ -ENTRY(xen_sysenter_target) - addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp - -ENTRY(xen_hypervisor_callback) - pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - TRACE_IRQS_OFF - - /* Check to see if we got the event in the critical - region in xen_iret_direct, after we've reenabled - events and checked for pending events. This simulates - iret instruction's behaviour where it delivers a - pending interrupt when enabling interrupts. */ - movl PT_EIP(%esp),%eax - cmpl $xen_iret_start_crit,%eax - jb 1f - cmpl $xen_iret_end_crit,%eax - jae 1f - - jmp xen_iret_crit_fixup - -ENTRY(xen_do_upcall) -1: mov %esp, %eax - call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall -#endif - jmp ret_from_intr -ENDPROC(xen_hypervisor_callback) - -# Hypervisor uses this for application faults while it executes. -# We get here for two reasons: -# 1. Fault while reloading DS, ES, FS or GS -# 2. Fault while executing IRET -# Category 1 we fix up by reattempting the load, and zeroing the segment -# register if the load fails. -# Category 2 we fix up by jumping to do_iret_error. We cannot use the -# normal Linux return path in this case because if we use the IRET hypercall -# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -# We distinguish between categories by maintaining a status value in EAX. -ENTRY(xen_failsafe_callback) - pushl %eax - movl $1,%eax -1: mov 4(%esp),%ds -2: mov 8(%esp),%es -3: mov 12(%esp),%fs -4: mov 16(%esp),%gs - /* EAX == 0 => Category 1 (Bad segment) - EAX != 0 => Category 2 (Bad IRET) */ - testl %eax,%eax - popl %eax - lea 16(%esp),%esp - jz 5f - jmp iret_exc -5: pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - jmp ret_from_exception - -.section .fixup,"ax" -6: xorl %eax,%eax - movl %eax,4(%esp) - jmp 1b -7: xorl %eax,%eax - movl %eax,8(%esp) - jmp 2b -8: xorl %eax,%eax - movl %eax,12(%esp) - jmp 3b -9: xorl %eax,%eax - movl %eax,16(%esp) - jmp 4b -.previous - _ASM_EXTABLE(1b,6b) - _ASM_EXTABLE(2b,7b) - _ASM_EXTABLE(3b,8b) - _ASM_EXTABLE(4b,9b) -ENDPROC(xen_failsafe_callback) - -BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) - -#endif /* CONFIG_XEN */ - -#if IS_ENABLED(CONFIG_HYPERV) - -BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) - -#endif /* CONFIG_HYPERV */ - -#ifdef CONFIG_FUNCTION_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE - -ENTRY(mcount) - ret -END(mcount) - -ENTRY(ftrace_caller) - pushl %eax - pushl %ecx - pushl %edx - pushl $0 /* Pass NULL as regs pointer */ - movl 4*4(%esp), %eax - movl 0x4(%ebp), %edx - movl function_trace_op, %ecx - subl $MCOUNT_INSN_SIZE, %eax - -.globl ftrace_call -ftrace_call: - call ftrace_stub - - addl $4,%esp /* skip NULL pointer */ - popl %edx - popl %ecx - popl %eax -ftrace_ret: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - jmp ftrace_stub -#endif - -.globl ftrace_stub -ftrace_stub: - ret -END(ftrace_caller) - -ENTRY(ftrace_regs_caller) - pushf /* push flags before compare (in cs location) */ - - /* - * i386 does not save SS and ESP when coming from kernel. - * Instead, to get sp, ®s->sp is used (see ptrace.h). - * Unfortunately, that means eflags must be at the same location - * as the current return ip is. We move the return ip into the - * ip location, and move flags into the return ip location. - */ - pushl 4(%esp) /* save return ip into ip slot */ - - pushl $0 /* Load 0 into orig_ax */ - pushl %gs - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - - movl 13*4(%esp), %eax /* Get the saved flags */ - movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ - /* clobbering return ip */ - movl $__KERNEL_CS,13*4(%esp) - - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ - -GLOBAL(ftrace_regs_call) - call ftrace_stub - - addl $4, %esp /* Skip pt_regs */ - movl 14*4(%esp), %eax /* Move flags back into cs */ - movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ - movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - movl %eax, 14*4(%esp) /* Put return ip back for ret */ - - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - addl $8, %esp /* Skip orig_ax and ip */ - popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret - - popf - jmp ftrace_stub -#else /* ! CONFIG_DYNAMIC_FTRACE */ - -ENTRY(mcount) - cmpl $__PAGE_OFFSET, %esp - jb ftrace_stub /* Paging not enabled yet? */ - - cmpl $ftrace_stub, ftrace_trace_function - jnz trace -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpl $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif -.globl ftrace_stub -ftrace_stub: - ret - - /* taken from glibc */ -trace: - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - movl 0x4(%ebp), %edx - subl $MCOUNT_INSN_SIZE, %eax - - call *ftrace_trace_function - - popl %edx - popl %ecx - popl %eax - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - lea 0x4(%ebp), %edx - movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %eax - call prepare_ftrace_return - popl %edx - popl %ecx - popl %eax - ret -END(ftrace_graph_caller) - -.globl return_to_handler -return_to_handler: - pushl %eax - pushl %edx - movl %ebp, %eax - call ftrace_return_to_handler - movl %eax, %ecx - popl %edx - popl %eax - jmp *%ecx -#endif - -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - ASM_CLAC - pushl $trace_do_page_fault - jmp error_code -END(trace_page_fault) -#endif - -ENTRY(page_fault) - ASM_CLAC - pushl $do_page_fault - ALIGN -error_code: - /* the function address is in %gs's slot on the stack */ - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - GS_TO_REG %ecx - movl PT_GS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - REG_TO_PTGS %ecx - SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception -END(page_fault) - -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -.macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok -\label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp - pushfl - pushl $__KERNEL_CS - pushl $sysenter_past_esp -.endm - -ENTRY(debug) - ASM_CLAC - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn -debug_stack_correct: - pushl $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception -END(debug) - -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. - */ -ENTRY(nmi) - ASM_CLAC -#ifdef CONFIG_X86_ESPFIX32 - pushl %eax - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - je nmi_espfix_stack -#endif - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl %eax - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - pushl %eax - SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_all_notrace - -nmi_stack_fixup: - FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct - -nmi_debug_stack_check: - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct - -#ifdef CONFIG_X86_ESPFIX32 -nmi_espfix_stack: - /* - * create the pointer to lss back - */ - pushl %ss - pushl %esp - addl $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - .endr - pushl %eax - SAVE_ALL - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - jmp irq_return -#endif -END(nmi) - -ENTRY(int3) - ASM_CLAC - pushl $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception -END(int3) - -ENTRY(general_protection) - pushl $do_general_protection - jmp error_code -END(general_protection) - -#ifdef CONFIG_KVM_GUEST -ENTRY(async_page_fault) - ASM_CLAC - pushl $do_async_page_fault - jmp error_code -END(async_page_fault) -#endif - diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S deleted file mode 100644 index 4ad79e946f5a..000000000000 --- a/arch/x86/kernel/entry_64.S +++ /dev/null @@ -1,1442 +0,0 @@ -/* - * linux/arch/x86_64/entry.S - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * Copyright (C) 2000 Pavel Machek - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * - * Some of this is documented in Documentation/x86/entry_64.txt - * - * NOTE: This code handles signal-recognition, which happens every time - * after an interrupt and after each system call. - * - * A note on terminology: - * - iret frame: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. - * - * Some macro usage: - * - ENTRY/END Define functions in the symbol table. - * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - idtentry - Define exception entry points. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - - .code64 - .section .entry.text, "ax" - - -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret64) - swapgs - sysretq -ENDPROC(native_usergs_sysret64) -#endif /* CONFIG_PARAVIRT */ - - -.macro TRACE_IRQS_IRETQ -#ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON -1: -#endif -.endm - -/* - * When dynamic function tracer is enabled it will add a breakpoint - * to all locations that it is about to modify, sync CPUs, update - * all the code, sync CPUs, then remove the breakpoints. In this time - * if lockdep is enabled, it might jump back into the debug handler - * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). - * - * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to - * make sure the stack pointer does not get reset back to the top - * of the debug stack, and instead just reuses the current stack. - */ -#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) - -.macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero - TRACE_IRQS_OFF - call debug_stack_reset -.endm - -.macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero - TRACE_IRQS_ON - call debug_stack_reset -.endm - -.macro TRACE_IRQS_IRETQ_DEBUG - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON_DEBUG -1: -.endm - -#else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ -#endif - -/* - * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. - * - * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. - * - * Registers on entry: - * rax system call number - * rcx return address - * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) - * rdi arg0 - * rsi arg1 - * rdx arg2 - * r10 arg3 (needs to be moved to rcx to conform to C ABI) - * r8 arg4 - * r9 arg5 - * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) - * - * Only called from user space. - * - * When user can change pt_regs->foo always force IRET. That is because - * it deals with uncanonical addresses better. SYSRET has trouble - * with them due to bugs in both AMD and Intel CPUs. - */ - -ENTRY(system_call) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - /* - * A hypervisor implementation might want to use a label - * after the swapgs, so that it can do the swapgs - * for the guest and jump here on syscall. - */ -GLOBAL(system_call_after_swapgs) - - movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp - - /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - /* - * Re-enable interrupts. - * We use 'rsp_scratch' as a scratch space, hence irq-off block above - * must execute atomically in the face of possible interrupt-driven - * task preemption. We must enable interrupts only after we're done - * with using rsp_scratch: - */ - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 */ - pushq %r9 /* pt_regs->r9 */ - pushq %r10 /* pt_regs->r10 */ - pushq %r11 /* pt_regs->r11 */ - sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys -system_call_fastpath: -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax -#else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: -/* - * Syscall return path ending with SYSRET (fast path). - * Has incompletely filled pt_regs. - */ - LOCKDEP_SYS_EXIT - /* - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - DISABLE_INTERRUPTS(CLBR_NONE) - - /* - * We must check ti flags with interrupts (or at least preemption) - * off because we must *never* return to userspace without - * processing exit work that is enqueued if we're preempted here. - * In particular, returning to userspace with any of the one-shot - * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is - * very bad. - */ - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ - - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp),%rcx - movq EFLAGS(%rsp),%r11 - movq RSP(%rsp),%rsp - /* - * 64bit SYSRET restores rip from rcx, - * rflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * Restoration of rflags re-enables interrupts. - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we should - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. (Actually - * detecting the failure in 64-bit userspace is tricky but can be - * done.) - */ - USERGS_SYSRET64 - - /* Do syscall entry tracing */ -tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp system_call_fastpath /* and return to the fast path */ - -tracesys_phase2: - SAVE_EXTRA_REGS - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax,%rdx - call syscall_trace_enter_phase2 - - /* - * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_entry_phase2() returned - * the value it wants us to use in the table lookup. - */ - RESTORE_C_REGS_EXCEPT_RAX - RESTORE_EXTRA_REGS -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax -#else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx /* fixup for C */ - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: - /* Use IRET because user could have changed pt_regs->foo */ - -/* - * Syscall return path ending with IRET. - * Has correct iret frame. - */ -GLOBAL(int_ret_from_sys_call) - DISABLE_INTERRUPTS(CLBR_NONE) -int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ - TRACE_IRQS_OFF - movl $_TIF_ALLWORK_MASK,%edi - /* edi: mask to check */ -GLOBAL(int_with_check) - LOCKDEP_SYS_EXIT_IRQ - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) - jmp syscall_return - - /* Either reschedule or signal or syscall exit tracking needed. */ - /* First do a reschedule test. */ - /* edx: work, edi: workmask */ -int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - SCHEDULE_USER - popq %rdi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - - /* handle signals and tracing -- both require a full pt_regs */ -int_very_careful: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_EXTRA_REGS - /* Check for syscall exit trace */ - testl $_TIF_WORK_SYSCALL_EXIT,%edx - jz int_signal - pushq %rdi - leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace_leave - popq %rdi - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi - jmp int_restore_rest - -int_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call do_notify_resume -1: movl $_TIF_WORK_MASK,%edi -int_restore_rest: - RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - -syscall_return: - /* The IRETQ could re-enable interrupts: */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ - - /* - * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. - */ - movq RCX(%rsp),%rcx - movq RIP(%rsp),%r11 - cmpq %rcx,%r11 /* RCX == RIP */ - jne opportunistic_sysret_failed - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. - * - * If width of "canonical tail" ever becomes variable, this will need - * to be updated to remain correct on both old and new CPUs. - */ - .ifne __VIRTUAL_MASK_SHIFT - 47 - .error "virtual address width changed -- SYSRET checks need update" - .endif - /* Change top 16 bits to be the sign-extension of 47th bit */ - shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - /* If this changed %rcx, it was not canonical */ - cmpq %rcx, %r11 - jne opportunistic_sysret_failed - - cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed - - movq R11(%rsp),%r11 - cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed - - /* - * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. This would cause an infinite loop whenever #DB happens - * with register state that satisfies the opportunistic SYSRET - * conditions. For example, single-stepping this user code: - * - * movq $stuck_here,%rcx - * pushfq - * popq %r11 - * stuck_here: - * - * would never get past 'stuck_here'. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed - - /* nothing to check for RSP */ - - cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed - - /* - * We win! This label is here just for ease of understanding - * perf profiles. Nothing jumps here. - */ -syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp),%rsp - USERGS_SYSRET64 - -opportunistic_sysret_failed: - SWAPGS - jmp restore_c_regs_and_iret -END(system_call) - - - .macro FORK_LIKE func -ENTRY(stub_\func) - SAVE_EXTRA_REGS 8 - jmp sys_\func -END(stub_\func) - .endm - - FORK_LIKE clone - FORK_LIKE fork - FORK_LIKE vfork - -ENTRY(stub_execve) - call sys_execve -return_from_execve: - testl %eax, %eax - jz 1f - /* exec failed, can use fast SYSRET code path in this case */ - ret -1: - /* must use IRET code path (pt_regs->cs may have changed) */ - addq $8, %rsp - ZERO_EXTRA_REGS - movq %rax,RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_execve) -/* - * Remaining execve stubs are only 7 bytes long. - * ENTRY() often aligns to 16 bytes, which in this case has no benefits. - */ - .align 8 -GLOBAL(stub_execveat) - call sys_execveat - jmp return_from_execve -END(stub_execveat) - -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) - .align 8 -GLOBAL(stub_x32_execve) -GLOBAL(stub32_execve) - call compat_sys_execve - jmp return_from_execve -END(stub32_execve) -END(stub_x32_execve) - .align 8 -GLOBAL(stub_x32_execveat) -GLOBAL(stub32_execveat) - call compat_sys_execveat - jmp return_from_execve -END(stub32_execveat) -END(stub_x32_execveat) -#endif - -/* - * sigreturn is special because it needs to restore all registers on return. - * This cannot be done with SYSRET, so use the IRET return path instead. - */ -ENTRY(stub_rt_sigreturn) - /* - * SAVE_EXTRA_REGS result is not normally needed: - * sigreturn overwrites all pt_regs->GPREGS. - * But sigreturn can fail (!), and there is no easy way to detect that. - * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, - * we SAVE_EXTRA_REGS here. - */ - SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn -return_from_stub: - addq $8, %rsp - RESTORE_EXTRA_REGS - movq %rax,RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_rt_sigreturn) - -#ifdef CONFIG_X86_X32_ABI -ENTRY(stub_x32_rt_sigreturn) - SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub -END(stub_x32_rt_sigreturn) -#endif - -/* - * A newly forked process directly context switches into this address. - * - * rdi: prev task we switched from - */ -ENTRY(ret_from_fork) - - LOCK ; btr $TIF_FORK,TI_flags(%r8) - - pushq $0x0002 - popfq # reset kernel eflags - - call schedule_tail # rdi: 'prev' task parameter - - RESTORE_EXTRA_REGS - - testb $3, CS(%rsp) # from kernel_thread? - - /* - * By the time we get here, we have no idea whether our pt_regs, - * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the ia32entry paths. - * Use IRET code path to return, since it can safely handle - * all of the above. - */ - jnz int_ret_from_sys_call - - /* We came from kernel_thread */ - /* nb: we depend on RESTORE_EXTRA_REGS above */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call -END(ret_from_fork) - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -ENTRY(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushq $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_interrupt - .align 8 - .endr -END(irq_entries_start) - -/* - * Interrupt entry/exit. - * - * Interrupt entry points save only callee clobbered registers in fast path. - * - * Entry runs with interrupts off. - */ - -/* 0(%rsp): ~(interrupt number) */ - .macro interrupt func - cld - /* - * Since nothing in interrupt handling code touches r12...r15 members - * of "struct pt_regs", and since interrupts can nest, we can save - * four stack slots and simultaneously provide - * an unwind-friendly stack layout by saving "truncated" pt_regs - * exactly up to rbp slot, without these members. - */ - ALLOC_PT_GPREGS_ON_STACK -RBP - SAVE_C_REGS -RBP - /* this goes to 0(%rsp) for unwinder, not for saving the value: */ - SAVE_EXTRA_REGS_RBP -RBP - - leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ - - testb $3, CS-RBP(%rsp) - jz 1f - SWAPGS -1: - /* - * Save previous stack pointer, optionally switch to interrupt stack. - * irq_count is used to check if a CPU is already on an interrupt stack - * or not. While this is essentially redundant with preempt_count it is - * a little cheaper to use a separate counter in the PDA (short of - * moving irq_enter into assembly, which would be too much work) - */ - movq %rsp, %rsi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rsi - /* We entered an interrupt context - irqs are off: */ - TRACE_IRQS_OFF - - call \func - .endm - - /* - * The interrupt stubs push (~vector+0x80) onto the stack and - * then jump to common_interrupt. - */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: - ASM_CLAC - addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ - interrupt do_IRQ - /* 0(%rsp): old RSP */ -ret_from_intr: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) - - /* Restore saved previous stack */ - popq %rsi - /* return code expects complete pt_regs - adjust rsp accordingly: */ - leaq -RBP(%rsi),%rsp - - testb $3, CS(%rsp) - jz retint_kernel - /* Interrupt came from user space */ -retint_user: - GET_THREAD_INFO(%rcx) - /* - * %rcx: thread info. Interrupts off. - */ -retint_with_reschedule: - movl $_TIF_WORK_MASK,%edi -retint_check: - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz retint_careful - -retint_swapgs: /* return to user-space */ - /* - * The iretq could re-enable interrupts: - */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ - - SWAPGS - jmp restore_c_regs_and_iret - -/* Returning to kernel space */ -retint_kernel: -#ifdef CONFIG_PREEMPT - /* Interrupts are off */ - /* Check if we need preemption */ - bt $9,EFLAGS(%rsp) /* interrupts were off? */ - jnc 1f -0: cmpl $0,PER_CPU_VAR(__preempt_count) - jnz 1f - call preempt_schedule_irq - jmp 0b -1: -#endif - /* - * The iretq could re-enable interrupts: - */ - TRACE_IRQS_IRETQ - -/* - * At this label, code paths which return to kernel and to user, - * which come from interrupts/exception and from syscalls, merge. - */ -restore_c_regs_and_iret: - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - -irq_return: - INTERRUPT_RETURN - -ENTRY(native_iret) - /* - * Are we returning to a stack segment from the LDT? Note: in - * 64-bit mode SS:RSP on the exception stack is always valid. - */ -#ifdef CONFIG_X86_ESPFIX64 - testb $4,(SS-RIP)(%rsp) - jnz native_irq_return_ldt -#endif - -.global native_irq_return_iret -native_irq_return_iret: - /* - * This may fault. Non-paranoid faults on return to userspace are - * handled by fixup_bad_iret. These include #SS, #GP, and #NP. - * Double-faults due to espfix64 are handled in do_double_fault. - * Other faults here are fatal. - */ - iretq - -#ifdef CONFIG_X86_ESPFIX64 -native_irq_return_ldt: - pushq %rax - pushq %rdi - SWAPGS - movq PER_CPU_VAR(espfix_waddr),%rdi - movq %rax,(0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp),%rax /* RIP */ - movq %rax,(1*8)(%rdi) - movq (3*8)(%rsp),%rax /* CS */ - movq %rax,(2*8)(%rdi) - movq (4*8)(%rsp),%rax /* RFLAGS */ - movq %rax,(3*8)(%rdi) - movq (6*8)(%rsp),%rax /* SS */ - movq %rax,(5*8)(%rdi) - movq (5*8)(%rsp),%rax /* RSP */ - movq %rax,(4*8)(%rdi) - andl $0xffff0000,%eax - popq %rdi - orq PER_CPU_VAR(espfix_stack),%rax - SWAPGS - movq %rax,%rsp - popq %rax - jmp native_irq_return_iret -#endif - - /* edi: workmask, edx: work */ -retint_careful: - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - SCHEDULE_USER - popq %rdi - GET_THREAD_INFO(%rcx) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp retint_check - -retint_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz retint_swapgs - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_EXTRA_REGS - movq $-1,ORIG_RAX(%rsp) - xorl %esi,%esi # oldset - movq %rsp,%rdi # &pt_regs - call do_notify_resume - RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - jmp retint_with_reschedule - -END(common_interrupt) - -/* - * APIC interrupts. - */ -.macro apicinterrupt3 num sym do_sym -ENTRY(\sym) - ASM_CLAC - pushq $~(\num) -.Lcommon_\sym: - interrupt \do_sym - jmp ret_from_intr -END(\sym) -.endm - -#ifdef CONFIG_TRACING -#define trace(sym) trace_##sym -#define smp_trace(sym) smp_trace_##sym - -.macro trace_apicinterrupt num sym -apicinterrupt3 \num trace(\sym) smp_trace(\sym) -.endm -#else -.macro trace_apicinterrupt num sym do_sym -.endm -#endif - -.macro apicinterrupt num sym do_sym -apicinterrupt3 \num \sym \do_sym -trace_apicinterrupt \num \sym -.endm - -#ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ - irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR \ - reboot_interrupt smp_reboot_interrupt -#endif - -#ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE \ - uv_bau_message_intr1 uv_bau_message_interrupt -#endif -apicinterrupt LOCAL_TIMER_VECTOR \ - apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR \ - x86_platform_ipi smp_x86_platform_ipi - -#ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR \ - kvm_posted_intr_ipi smp_kvm_posted_intr_ipi -#endif - -#ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt smp_threshold_interrupt -#endif - -#ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR \ - thermal_interrupt smp_thermal_interrupt -#endif - -#ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ - call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR \ - call_function_interrupt smp_call_function_interrupt -apicinterrupt RESCHEDULE_VECTOR \ - reschedule_interrupt smp_reschedule_interrupt -#endif - -apicinterrupt ERROR_APIC_VECTOR \ - error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR \ - spurious_interrupt smp_spurious_interrupt - -#ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR \ - irq_work_interrupt smp_irq_work_interrupt -#endif - -/* - * Exception entry points. - */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) - -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 -ENTRY(\sym) - /* Sanity check */ - .if \shift_ist != -1 && \paranoid == 0 - .error "using shift_ist requires paranoid=1" - .endif - - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - - .ifeq \has_error_code - pushq $-1 /* ORIG_RAX: no syscall to restart */ - .endif - - ALLOC_PT_GPREGS_ON_STACK - - .if \paranoid - .if \paranoid == 1 - testb $3, CS(%rsp) /* If coming from userspace, switch */ - jnz 1f /* stacks. */ - .endif - call paranoid_entry - .else - call error_entry - .endif - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - - .if \paranoid - .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ - .else - TRACE_IRQS_OFF - .endif - .endif - - movq %rsp,%rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi,%esi /* no error code */ - .endif - - .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) - .endif - - call \do_sym - - .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) - .endif - - /* these procedures expect "no swapgs" flag in ebx */ - .if \paranoid - jmp paranoid_exit - .else - jmp error_exit - .endif - - .if \paranoid == 1 - /* - * Paranoid entry from userspace. Switch stacks and treat it - * as a normal entry. This means that paranoid handlers - * run in real process context if user_mode(regs). - */ -1: - call error_entry - - - movq %rsp,%rdi /* pt_regs pointer */ - call sync_regs - movq %rax,%rsp /* switch stack */ - - movq %rsp,%rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi,%esi /* no error code */ - .endif - - call \do_sym - - jmp error_exit /* %ebx: no swapgs flag */ - .endif -END(\sym) -.endm - -#ifdef CONFIG_TRACING -.macro trace_idtentry sym do_sym has_error_code:req -idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#else -.macro trace_idtentry sym do_sym has_error_code:req -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#endif - -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - - - /* Reload gs selector with exception handling */ - /* edi: new selector */ -ENTRY(native_load_gs_index) - pushfq - DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) - SWAPGS -gs_change: - movl %edi,%gs -2: mfence /* workaround */ - SWAPGS - popfq - ret -END(native_load_gs_index) - - _ASM_EXTABLE(gs_change,bad_gs) - .section .fixup,"ax" - /* running with kernelgs */ -bad_gs: - SWAPGS /* switch back to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b - .previous - -/* Call softirq on interrupt stack. Interrupts are off. */ -ENTRY(do_softirq_own_stack) - pushq %rbp - mov %rsp,%rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr),%rsp - push %rbp # backlink for old unwinder - call __do_softirq - leaveq - decl PER_CPU_VAR(irq_count) - ret -END(do_softirq_own_stack) - -#ifdef CONFIG_XEN -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 - -/* - * A note on the "critical region" in our callback handler. - * We want to avoid stacking callback handlers due to events occurring - * during handling of the last event. To do this, we keep events disabled - * until we've done all processing. HOWEVER, we must enable events before - * popping the stack frame (can't be done atomically) and so it would still - * be possible to get enough handler activations to overflow the stack. - * Although unlikely, bugs of that kind are hard to track down, so we'd - * like to avoid the possibility. - * So, on entry to the handler we detect whether we interrupted an - * existing activation in its critical region -- if so, we pop the current - * activation and restart the handler using the previous one. - */ -ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) -/* - * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will - * see the correct pointer to the pt_regs - */ - movq %rdi, %rsp # we don't return, adjust the stack frame -11: incl PER_CPU_VAR(irq_count) - movq %rsp,%rbp - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rbp # backlink for old unwinder - call xen_evtchn_do_upcall - popq %rsp - decl PER_CPU_VAR(irq_count) -#ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall -#endif - jmp error_exit -END(xen_do_hypervisor_callback) - -/* - * Hypervisor uses this for application faults while it executes. - * We get here for two reasons: - * 1. Fault while reloading DS, ES, FS or GS - * 2. Fault while executing IRET - * Category 1 we do not need to fix up as Xen has already reloaded all segment - * registers that could be reloaded and zeroed the others. - * Category 2 we fix up by killing the current process. We cannot use the - * normal Linux return path in this case because if we use the IRET hypercall - * to pop the stack frame we end up in an infinite loop of failsafe callbacks. - * We distinguish between categories by comparing each saved segment register - * with its current contents: any discrepancy means we in category 1. - */ -ENTRY(xen_failsafe_callback) - movl %ds,%ecx - cmpw %cx,0x10(%rsp) - jne 1f - movl %es,%ecx - cmpw %cx,0x18(%rsp) - jne 1f - movl %fs,%ecx - cmpw %cx,0x20(%rsp) - jne 1f - movl %gs,%ecx - cmpw %cx,0x28(%rsp) - jne 1f - /* All segments match their saved values => Category 2 (Bad IRET). */ - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x30,%rsp - pushq $0 /* RIP */ - pushq %r11 - pushq %rcx - jmp general_protection -1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x30,%rsp - pushq $-1 /* orig_ax = -1 => not a system call */ - ALLOC_PT_GPREGS_ON_STACK - SAVE_C_REGS - SAVE_EXTRA_REGS - jmp error_exit -END(xen_failsafe_callback) - -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - xen_hvm_callback_vector xen_evtchn_do_upcall - -#endif /* CONFIG_XEN */ - -#if IS_ENABLED(CONFIG_HYPERV) -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - hyperv_callback_vector hyperv_vector_handler -#endif /* CONFIG_HYPERV */ - -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry stack_segment do_stack_segment has_error_code=1 -#ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 -#endif -idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 -#ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 -#endif -#ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) -#endif - -/* - * Save all registers in pt_regs, and switch gs if needed. - * Use slow, but surefire "are we in kernel?" check. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise - */ -ENTRY(paranoid_entry) - cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx,%ebx -1: ret -END(paranoid_entry) - -/* - * "Paranoid" exit path from exception stack. This is invoked - * only on return from non-NMI IST interrupts that came - * from kernel space. - * - * We may be returning to very strange contexts (e.g. very early - * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, we there's no good reason - * to try to handle preemption here. - */ -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ -ENTRY(paranoid_exit) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF_DEBUG - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs - TRACE_IRQS_IRETQ - SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore -paranoid_exit_no_swapgs: - TRACE_IRQS_IRETQ_DEBUG -paranoid_exit_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - INTERRUPT_RETURN -END(paranoid_exit) - -/* - * Save all registers in pt_regs, and switch gs if needed. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise - */ -ENTRY(error_entry) - cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 - xorl %ebx,%ebx - testb $3, CS+8(%rsp) - jz error_kernelspace -error_swapgs: - SWAPGS -error_sti: - TRACE_IRQS_OFF - ret - - /* - * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. B stepping K8s sometimes report a - * truncated RIP for IRET exceptions returning to compat mode. Check - * for these here too. - */ -error_kernelspace: - incl %ebx - leaq native_irq_return_iret(%rip),%rcx - cmpq %rcx,RIP+8(%rsp) - je error_bad_iret - movl %ecx,%eax /* zero extend */ - cmpq %rax,RIP+8(%rsp) - je bstep_iret - cmpq $gs_change,RIP+8(%rsp) - je error_swapgs - jmp error_sti - -bstep_iret: - /* Fix truncated RIP */ - movq %rcx,RIP+8(%rsp) - /* fall through */ - -error_bad_iret: - SWAPGS - mov %rsp,%rdi - call fixup_bad_iret - mov %rax,%rsp - decl %ebx /* Return to usergs */ - jmp error_sti -END(error_entry) - - -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ -ENTRY(error_exit) - movl %ebx,%eax - RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %eax,%eax - jnz retint_kernel - jmp retint_user -END(error_exit) - -/* Runs on exception stack */ -ENTRY(nmi) - PARAVIRT_ADJUST_EXCEPTION_FRAME - /* - * We allow breakpoints in NMIs. If a breakpoint occurs, then - * the iretq it performs will take us out of NMI context. - * This means that we can have nested NMIs where the next - * NMI is using the top of the stack of the previous NMI. We - * can't let it execute because the nested NMI will corrupt the - * stack of the previous NMI. NMI handlers are not re-entrant - * anyway. - * - * To handle this case we do the following: - * Check the a special location on the stack that contains - * a variable that is set when NMIs are executing. - * The interrupted task's stack is also checked to see if it - * is an NMI stack. - * If the variable is not set and the stack is not the NMI - * stack then: - * o Set the special variable on the stack - * o Copy the interrupt frame into a "saved" location on the stack - * o Copy the interrupt frame into a "copy" location on the stack - * o Continue processing the NMI - * If the variable is set or the previous stack is the NMI stack: - * o Modify the "copy" location to jump to the repeate_nmi - * o return back to the first NMI - * - * Now on exit of the first NMI, we first clear the stack variable - * The NMI stack will tell any nested NMIs at that point that it is - * nested. Then we pop the stack normally with iret, and if there was - * a nested NMI that updated the copy interrupt stack frame, a - * jump will be made to the repeat_nmi code that will handle the second - * NMI. - */ - - /* Use %rdx as our temp variable throughout */ - pushq %rdx - - /* - * If %cs was not the kernel segment, then the NMI triggered in user - * space, which means it is definitely not nested. - */ - cmpl $__KERNEL_CS, 16(%rsp) - jne first_nmi - - /* - * Check the special variable on the stack to see if NMIs are - * executing. - */ - cmpl $1, -8(%rsp) - je nested_nmi - - /* - * Now test if the previous stack was an NMI stack. - * We need the double check. We check the NMI stack to satisfy the - * race when the first NMI clears the variable before returning. - * We check the variable because the first NMI could be in a - * breakpoint routine using a breakpoint stack. - */ - lea 6*8(%rsp), %rdx - /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ - cmpq %rdx, 4*8(%rsp) - /* If the stack pointer is above the NMI stack, this is a normal NMI */ - ja first_nmi - subq $EXCEPTION_STKSZ, %rdx - cmpq %rdx, 4*8(%rsp) - /* If it is below the NMI stack, it is a normal NMI */ - jb first_nmi - /* Ah, it is within the NMI stack, treat it as nested */ - -nested_nmi: - /* - * Do nothing if we interrupted the fixup in repeat_nmi. - * It's about to repeat the NMI handler, so we are fine - * with ignoring this one. - */ - movq $repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja 1f - movq $end_repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja nested_nmi_out - -1: - /* Set up the interrupted NMIs stack to jump to repeat_nmi */ - leaq -1*8(%rsp), %rdx - movq %rdx, %rsp - leaq -10*8(%rsp), %rdx - pushq $__KERNEL_DS - pushq %rdx - pushfq - pushq $__KERNEL_CS - pushq $repeat_nmi - - /* Put stack back */ - addq $(6*8), %rsp - -nested_nmi_out: - popq %rdx - - /* No need to check faults here */ - INTERRUPT_RETURN - -first_nmi: - /* - * Because nested NMIs will use the pushed location that we - * stored in rdx, we must keep that space available. - * Here's what our stack frame will look like: - * +-------------------------+ - * | original SS | - * | original Return RSP | - * | original RFLAGS | - * | original CS | - * | original RIP | - * +-------------------------+ - * | temp storage for rdx | - * +-------------------------+ - * | NMI executing variable | - * +-------------------------+ - * | copied SS | - * | copied Return RSP | - * | copied RFLAGS | - * | copied CS | - * | copied RIP | - * +-------------------------+ - * | Saved SS | - * | Saved Return RSP | - * | Saved RFLAGS | - * | Saved CS | - * | Saved RIP | - * +-------------------------+ - * | pt_regs | - * +-------------------------+ - * - * The saved stack frame is used to fix up the copied stack frame - * that a nested NMI may change to make the interrupted NMI iret jump - * to the repeat_nmi. The original stack frame and the temp storage - * is also used by nested NMIs and can not be trusted on exit. - */ - /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ - movq (%rsp), %rdx - - /* Set the NMI executing variable on the stack. */ - pushq $1 - - /* - * Leave room for the "copied" frame - */ - subq $(5*8), %rsp - - /* Copy the stack frame to the Saved frame */ - .rept 5 - pushq 11*8(%rsp) - .endr - - /* Everything up to here is safe from nested NMIs */ - - /* - * If there was a nested NMI, the first NMI's iret will return - * here. But NMIs are still enabled and we can take another - * nested NMI. The nested NMI checks the interrupted RIP to see - * if it is between repeat_nmi and end_repeat_nmi, and if so - * it will just return, as we are about to repeat an NMI anyway. - * This makes it safe to copy to the stack frame that a nested - * NMI will update. - */ -repeat_nmi: - /* - * Update the stack variable to say we are still in NMI (the update - * is benign for the non-repeat case, where 1 was pushed just above - * to this very stack slot). - */ - movq $1, 10*8(%rsp) - - /* Make another copy, this one may be modified by nested NMIs */ - addq $(10*8), %rsp - .rept 5 - pushq -6*8(%rsp) - .endr - subq $(5*8), %rsp -end_repeat_nmi: - - /* - * Everything below this point can be preempted by a nested - * NMI if the first NMI took an exception and reset our iret stack - * so that we repeat another NMI. - */ - pushq $-1 /* ORIG_RAX: no syscall to restart */ - ALLOC_PT_GPREGS_ON_STACK - - /* - * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit - * as we should not be calling schedule in NMI context. - * Even with normal interrupts enabled. An NMI should not be - * setting NEED_RESCHED or anything that normal interrupts and - * exceptions might do. - */ - call paranoid_entry - - /* - * Save off the CR2 register. If we take a page fault in the NMI then - * it could corrupt the CR2 value. If the NMI preempts a page fault - * handler before it was able to read the CR2 register, and then the - * NMI itself takes a page fault, the page fault that was preempted - * will read the information from the NMI page fault and not the - * origin fault. Save it off and restore it if it changes. - * Use the r12 callee-saved register. - */ - movq %cr2, %r12 - - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp,%rdi - movq $-1,%rsi - call do_nmi - - /* Did the NMI take a page fault? Restore cr2 if it did */ - movq %cr2, %rcx - cmpq %rcx, %r12 - je 1f - movq %r12, %cr2 -1: - testl %ebx,%ebx /* swapgs needed? */ - jnz nmi_restore -nmi_swapgs: - SWAPGS_UNSAFE_STACK -nmi_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS - /* Pop the extra iret frame at once */ - REMOVE_PT_GPREGS_FROM_STACK 6*8 - - /* Clear the NMI executing stack variable */ - movq $0, 5*8(%rsp) - jmp irq_return -END(nmi) - -ENTRY(ignore_sysret) - mov $-ENOSYS,%eax - sysret -END(ignore_sysret) - -- cgit v1.2.1 From 19a433f45135656d065bdf1bbe543796b194ba3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:00:59 +0200 Subject: x86/asm/entry: Move the compat syscall entry code to arch/x86/entry/ Move the ia32entry.S file over into arch/x86/entry/. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 5 +- arch/x86/entry/ia32entry.S | 522 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/ia32/Makefile | 2 +- arch/x86/ia32/ia32entry.S | 522 --------------------------------------------- 4 files changed, 527 insertions(+), 524 deletions(-) create mode 100644 arch/x86/entry/ia32entry.S delete mode 100644 arch/x86/ia32/ia32entry.S (limited to 'arch/x86') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index fa7e0cf6d3c4..4a626594fa79 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,4 +1,7 @@ # # Makefile for the x86 low level entry code # -obj-y := entry_$(BITS).o +obj-y := entry_$(BITS).o + +obj-$(CONFIG_IA32_EMULATION) += ia32entry.o + diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S new file mode 100644 index 000000000000..2be23c734db5 --- /dev/null +++ b/arch/x86/entry/ia32entry.S @@ -0,0 +1,522 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysexit_audit ia32_ret_from_sys_call +#define sysretl_audit ia32_ret_from_sys_call +#endif + + .section .entry.text, "ax" + + /* clobbers %rax */ + .macro CLEAR_RREGS _r9=rax + xorl %eax,%eax + movq %rax,R11(%rsp) + movq %rax,R10(%rsp) + movq %\_r9,R9(%rsp) + movq %rax,R8(%rsp) + .endm + + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. Instead, we just truncate that + * value to 32 bits again as we did on entry from user mode. + * If it's a new value set by user_regset during entry tracing, + * this matches the normal truncation of the user-mode value. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + .macro LOAD_ARGS32 _r9=0 + .if \_r9 + movl R9(%rsp),%r9d + .endif + movl RCX(%rsp),%ecx + movl RDX(%rsp),%edx + movl RSI(%rsp),%esi + movl RDI(%rsp),%edi + movl %eax,%eax /* zero extension */ + .endm + + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) +#endif + +/* + * 32bit SYSENTER instruction entry. + * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_sysenter_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %ebp, %ebp + movl %eax, %eax + + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp */ + pushfq /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + cld + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + + /* + * no need to do an access_ok check here because rbp has been + * 32bit zero extended + */ + ASM_STAC +1: movl (%rbp),%ebp + _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. + */ + testl $X86_EFLAGS_NT,EFLAGS(%rsp) + jnz sysenter_fix_flags +sysenter_flags_fixed: + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysenter_tracesys + +sysenter_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ +sysenter_dispatch: + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysexit_audit +sysexit_from_sys_call: + /* + * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an + * NMI between STI and SYSEXIT has poorly specified behavior, + * and and NMI followed by an IRQ with usergs is fatal. So + * we just pretend we're using SYSEXIT but we really use + * SYSRETL instead. + * + * This code path is still called 'sysexit' because it pairs + * with 'sysenter' and it uses the SYSENTER calling convention. + */ + andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp),%ecx /* User %eip */ + RESTORE_RSI_RDI + xorl %edx,%edx /* avoid info leaks */ + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + movl EFLAGS(%rsp),%r11d /* User eflags */ + TRACE_IRQS_ON + + /* + * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, + * since it avoids a dicey window with interrupts enabled. + */ + movl RSP(%rsp),%esp + + /* + * USERGS_SYSRET32 does: + * gsbase = user's gs base + * eip = ecx + * rflags = r11 + * cs = __USER32_CS + * ss = __USER_DS + * + * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: + * + * pop %ebp + * pop %edx + * pop %ecx + * + * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to + * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's + * address (already known to user code), and R12-R15 are + * callee-saved and therefore don't contain any interesting + * kernel data. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi,%r8d /* 5th arg: 4th syscall arg */ + movl %ecx,%r9d /*swap with edx*/ + movl %edx,%ecx /* 4th arg: 3rd syscall arg */ + movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ + movl %ebx,%esi /* 2nd arg: 1st syscall arg */ + movl %eax,%edi /* 1st arg: syscall number */ + call __audit_syscall_entry + movl ORIG_RAX(%rsp),%eax /* reload syscall number */ + movl %ebx,%edi /* reload 1st syscall arg */ + movl RCX(%rsp),%esi /* reload 2nd syscall arg */ + movl RDX(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI(%rsp),%r8d /* reload 5th syscall arg */ + .endm + + .macro auditsys_exit exit + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movl %eax,%esi /* second arg, syscall return value */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movq RAX(%rsp),%rax /* reload syscall return value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz \exit + CLEAR_RREGS + jmp int_with_check + .endm + +sysenter_auditsys: + auditsys_entry_common + movl %ebp,%r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_fix_flags: + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq + jmp sysenter_flags_fixed + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz sysenter_auditsys +#endif + SAVE_EXTRA_REGS + CLEAR_RREGS + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + jmp sysenter_do_call +ENDPROC(ia32_sysenter_target) + +/* + * 32bit SYSCALL instruction entry. + * + * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * + * Arguments: + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_cstar_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movl %esp,%r8d + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rbp /* pt_regs->cx */ + movl %ebp,%ecx + pushq $-ENOSYS /* pt_regs->ax */ + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + + /* + * no need to do an access_ok check here because r8 has been + * 32bit zero extended + */ + ASM_STAC +1: movl (%r8),%r9d + _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz cstar_tracesys + +cstar_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + /* r9 already loaded */ /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ +cstar_dispatch: + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysretl_audit +sysretl_from_sys_call: + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + RESTORE_RSI_RDI_RDX + movl RIP(%rsp),%ecx + movl EFLAGS(%rsp),%r11d + xorq %r10,%r10 + xorq %r9,%r9 + xorq %r8,%r8 + TRACE_IRQS_ON + movl RSP(%rsp),%esp + /* + * 64bit->32bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * (Note: 32bit->32bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + movl %r9d,R9(%rsp) /* register to be clobbered by call */ + auditsys_entry_common + movl R9(%rsp),%r9d /* reload 6th syscall arg */ + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz cstar_auditsys +#endif + xchgl %r9d,%ebp + SAVE_EXTRA_REGS + CLEAR_RREGS r9 + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + xchgl %ebp,%r9d + jmp cstar_do_call +END(ia32_cstar_target) + +ia32_badarg: + ASM_CLAC + movq $-EFAULT,%rax + jmp ia32_sysret + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts off. + */ + +ENTRY(ia32_syscall) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + PARAVIRT_ADJUST_EXCEPTION_FRAME + SWAPGS + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + cld + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_tracesys +ia32_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative +ia32_sysret: + movq %rax,RAX(%rsp) +1: +ia32_ret_from_sys_call: + CLEAR_RREGS + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_EXTRA_REGS + CLEAR_RREGS + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + jmp ia32_do_call +END(ia32_syscall) + + .macro PTREGSCALL label, func + ALIGN +GLOBAL(\label) + leaq \func(%rip),%rax + jmp ia32_ptregs_common + .endm + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork + + ALIGN +GLOBAL(stub32_clone) + leaq sys_clone(%rip),%rax + mov %r8, %rcx + jmp ia32_ptregs_common + + ALIGN +ia32_ptregs_common: + SAVE_EXTRA_REGS 8 + call *%rax + RESTORE_EXTRA_REGS 8 + ret +END(ia32_ptregs_common) diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index bb635c641869..cd4339bae066 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -2,7 +2,7 @@ # Makefile for the ia32 kernel emulation subsystem. # -obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o +obj-$(CONFIG_IA32_EMULATION) := sys_ia32.o ia32_signal.o obj-$(CONFIG_IA32_AOUT) += ia32_aout.o diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S deleted file mode 100644 index 2be23c734db5..000000000000 --- a/arch/x86/ia32/ia32entry.S +++ /dev/null @@ -1,522 +0,0 @@ -/* - * Compatibility mode system call entry point for x86-64. - * - * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -#define sysexit_audit ia32_ret_from_sys_call -#define sysretl_audit ia32_ret_from_sys_call -#endif - - .section .entry.text, "ax" - - /* clobbers %rax */ - .macro CLEAR_RREGS _r9=rax - xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %\_r9,R9(%rsp) - movq %rax,R8(%rsp) - .endm - - /* - * Reload arg registers from stack in case ptrace changed them. - * We don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. Instead, we just truncate that - * value to 32 bits again as we did on entry from user mode. - * If it's a new value set by user_regset during entry tracing, - * this matches the normal truncation of the user-mode value. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. - */ - .macro LOAD_ARGS32 _r9=0 - .if \_r9 - movl R9(%rsp),%r9d - .endif - movl RCX(%rsp),%ecx - movl RDX(%rsp),%edx - movl RSI(%rsp),%esi - movl RDI(%rsp),%edi - movl %eax,%eax /* zero extension */ - .endm - - -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret32) - swapgs - sysretl -ENDPROC(native_usergs_sysret32) -#endif - -/* - * 32bit SYSENTER instruction entry. - * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). - * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp user stack - * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_sysenter_target) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %ebp, %ebp - movl %eax, %eax - - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - - /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp */ - pushfq /* pt_regs->flags */ - pushq $__USER32_CS /* pt_regs->cs */ - pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - - /* - * no need to do an access_ok check here because rbp has been - * 32bit zero extended - */ - ASM_STAC -1: movl (%rbp),%ebp - _ASM_EXTABLE(1b,ia32_badarg) - ASM_CLAC - - /* - * Sysenter doesn't filter flags, so we need to clear NT - * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. - */ - testl $X86_EFLAGS_NT,EFLAGS(%rsp) - jnz sysenter_fix_flags -sysenter_flags_fixed: - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysenter_tracesys - -sysenter_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ -sysenter_dispatch: - cmpq $(IA32_NR_syscalls-1),%rax - ja 1f - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysexit_audit -sysexit_from_sys_call: - /* - * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an - * NMI between STI and SYSEXIT has poorly specified behavior, - * and and NMI followed by an IRQ with usergs is fatal. So - * we just pretend we're using SYSEXIT but we really use - * SYSRETL instead. - * - * This code path is still called 'sysexit' because it pairs - * with 'sysenter' and it uses the SYSENTER calling convention. - */ - andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp),%ecx /* User %eip */ - RESTORE_RSI_RDI - xorl %edx,%edx /* avoid info leaks */ - xorq %r8,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - movl EFLAGS(%rsp),%r11d /* User eflags */ - TRACE_IRQS_ON - - /* - * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, - * since it avoids a dicey window with interrupts enabled. - */ - movl RSP(%rsp),%esp - - /* - * USERGS_SYSRET32 does: - * gsbase = user's gs base - * eip = ecx - * rflags = r11 - * cs = __USER32_CS - * ss = __USER_DS - * - * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: - * - * pop %ebp - * pop %edx - * pop %ecx - * - * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to - * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's - * address (already known to user code), and R12-R15 are - * callee-saved and therefore don't contain any interesting - * kernel data. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL - .macro auditsys_entry_common - movl %esi,%r8d /* 5th arg: 4th syscall arg */ - movl %ecx,%r9d /*swap with edx*/ - movl %edx,%ecx /* 4th arg: 3rd syscall arg */ - movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ - movl %ebx,%esi /* 2nd arg: 1st syscall arg */ - movl %eax,%edi /* 1st arg: syscall number */ - call __audit_syscall_entry - movl ORIG_RAX(%rsp),%eax /* reload syscall number */ - movl %ebx,%edi /* reload 1st syscall arg */ - movl RCX(%rsp),%esi /* reload 2nd syscall arg */ - movl RDX(%rsp),%edx /* reload 3rd syscall arg */ - movl RSI(%rsp),%ecx /* reload 4th syscall arg */ - movl RDI(%rsp),%r8d /* reload 5th syscall arg */ - .endm - - .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax,%esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX(%rsp),%rax /* reload syscall return value */ - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - CLEAR_RREGS - jmp int_with_check - .endm - -sysenter_auditsys: - auditsys_entry_common - movl %ebp,%r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch - -sysexit_audit: - auditsys_exit sysexit_from_sys_call -#endif - -sysenter_fix_flags: - pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) - popfq - jmp sysenter_flags_fixed - -sysenter_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz sysenter_auditsys -#endif - SAVE_EXTRA_REGS - CLEAR_RREGS - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - jmp sysenter_do_call -ENDPROC(ia32_sysenter_target) - -/* - * 32bit SYSCALL instruction entry. - * - * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. - * - * Note: rflags saving+masking-with-MSR happens only in Long mode - * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit - * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes - * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). - * - * Arguments: - * eax system call number - * ecx return address - * ebx arg1 - * ebp arg2 (note: not saved in the stack frame, should not be touched) - * edx arg3 - * esi arg4 - * edi arg5 - * esp user stack - * 0(%esp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_cstar_target) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movl %esp,%r8d - movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax - - /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ - pushq %r8 /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER32_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rbp /* pt_regs->cx */ - movl %ebp,%ecx - pushq $-ENOSYS /* pt_regs->ax */ - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - - /* - * no need to do an access_ok check here because r8 has been - * 32bit zero extended - */ - ASM_STAC -1: movl (%r8),%r9d - _ASM_EXTABLE(1b,ia32_badarg) - ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz cstar_tracesys - -cstar_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - /* r9 already loaded */ /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ -cstar_dispatch: - cmpq $(IA32_NR_syscalls-1),%rax - ja 1f - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit -sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - RESTORE_RSI_RDI_RDX - movl RIP(%rsp),%ecx - movl EFLAGS(%rsp),%r11d - xorq %r10,%r10 - xorq %r9,%r9 - xorq %r8,%r8 - TRACE_IRQS_ON - movl RSP(%rsp),%esp - /* - * 64bit->32bit SYSRET restores eip from ecx, - * eflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * (Note: 32bit->32bit SYSRET is different: since r11 - * does not exist, it merely sets eflags.IF=1). - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we must - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL -cstar_auditsys: - movl %r9d,R9(%rsp) /* register to be clobbered by call */ - auditsys_entry_common - movl R9(%rsp),%r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call -#endif - -cstar_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys -#endif - xchgl %r9d,%ebp - SAVE_EXTRA_REGS - CLEAR_RREGS r9 - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - xchgl %ebp,%r9d - jmp cstar_do_call -END(ia32_cstar_target) - -ia32_badarg: - ASM_CLAC - movq $-EFAULT,%rax - jmp ia32_sysret - -/* - * Emulated IA32 system calls via int 0x80. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ - -ENTRY(ia32_syscall) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - SWAPGS - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax - - /* Construct struct pt_regs on stack (iret frame is already on stack) */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys -ia32_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ - cmpq $(IA32_NR_syscalls-1),%rax - ja 1f - call *ia32_sys_call_table(,%rax,8) # xxx: rip relative -ia32_sysret: - movq %rax,RAX(%rsp) -1: -ia32_ret_from_sys_call: - CLEAR_RREGS - jmp int_ret_from_sys_call - -ia32_tracesys: - SAVE_EXTRA_REGS - CLEAR_RREGS - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - jmp ia32_do_call -END(ia32_syscall) - - .macro PTREGSCALL label, func - ALIGN -GLOBAL(\label) - leaq \func(%rip),%rax - jmp ia32_ptregs_common - .endm - - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork - - ALIGN -GLOBAL(stub32_clone) - leaq sys_clone(%rip),%rax - mov %r8, %rcx - jmp ia32_ptregs_common - - ALIGN -ia32_ptregs_common: - SAVE_EXTRA_REGS 8 - call *%rax - RESTORE_EXTRA_REGS 8 - ret -END(ia32_ptregs_common) -- cgit v1.2.1 From d603c8e184d882cc3f55c599f4185bad956ee0a7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:05:44 +0200 Subject: x86/asm/entry, x86/vdso: Move the vDSO code to arch/x86/entry/vdso/ Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kbuild | 2 +- arch/x86/Makefile | 2 +- arch/x86/entry/Makefile | 2 + arch/x86/entry/vdso/.gitignore | 7 + arch/x86/entry/vdso/Makefile | 209 +++++++++++++++ arch/x86/entry/vdso/checkundef.sh | 10 + arch/x86/entry/vdso/vclock_gettime.c | 351 +++++++++++++++++++++++++ arch/x86/entry/vdso/vdso-layout.lds.S | 118 +++++++++ arch/x86/entry/vdso/vdso-note.S | 12 + arch/x86/entry/vdso/vdso.lds.S | 29 ++ arch/x86/entry/vdso/vdso2c.c | 253 ++++++++++++++++++ arch/x86/entry/vdso/vdso2c.h | 175 ++++++++++++ arch/x86/entry/vdso/vdso32-setup.c | 120 +++++++++ arch/x86/entry/vdso/vdso32/.gitignore | 1 + arch/x86/entry/vdso/vdso32/int80.S | 56 ++++ arch/x86/entry/vdso/vdso32/note.S | 44 ++++ arch/x86/entry/vdso/vdso32/sigreturn.S | 145 ++++++++++ arch/x86/entry/vdso/vdso32/syscall.S | 75 ++++++ arch/x86/entry/vdso/vdso32/sysenter.S | 116 ++++++++ arch/x86/entry/vdso/vdso32/vclock_gettime.c | 30 +++ arch/x86/entry/vdso/vdso32/vdso-fakesections.c | 1 + arch/x86/entry/vdso/vdso32/vdso32.lds.S | 37 +++ arch/x86/entry/vdso/vdsox32.lds.S | 25 ++ arch/x86/entry/vdso/vgetcpu.c | 28 ++ arch/x86/entry/vdso/vma.c | 300 +++++++++++++++++++++ arch/x86/vdso/.gitignore | 7 - arch/x86/vdso/Makefile | 209 --------------- arch/x86/vdso/checkundef.sh | 10 - arch/x86/vdso/vclock_gettime.c | 351 ------------------------- arch/x86/vdso/vdso-layout.lds.S | 118 --------- arch/x86/vdso/vdso-note.S | 12 - arch/x86/vdso/vdso.lds.S | 29 -- arch/x86/vdso/vdso2c.c | 253 ------------------ arch/x86/vdso/vdso2c.h | 175 ------------ arch/x86/vdso/vdso32-setup.c | 120 --------- arch/x86/vdso/vdso32/.gitignore | 1 - arch/x86/vdso/vdso32/int80.S | 56 ---- arch/x86/vdso/vdso32/note.S | 44 ---- arch/x86/vdso/vdso32/sigreturn.S | 145 ---------- arch/x86/vdso/vdso32/syscall.S | 75 ------ arch/x86/vdso/vdso32/sysenter.S | 116 -------- arch/x86/vdso/vdso32/vclock_gettime.c | 30 --- arch/x86/vdso/vdso32/vdso-fakesections.c | 1 - arch/x86/vdso/vdso32/vdso32.lds.S | 37 --- arch/x86/vdso/vdsox32.lds.S | 25 -- arch/x86/vdso/vgetcpu.c | 28 -- arch/x86/vdso/vma.c | 300 --------------------- 47 files changed, 2146 insertions(+), 2144 deletions(-) create mode 100644 arch/x86/entry/vdso/.gitignore create mode 100644 arch/x86/entry/vdso/Makefile create mode 100755 arch/x86/entry/vdso/checkundef.sh create mode 100644 arch/x86/entry/vdso/vclock_gettime.c create mode 100644 arch/x86/entry/vdso/vdso-layout.lds.S create mode 100644 arch/x86/entry/vdso/vdso-note.S create mode 100644 arch/x86/entry/vdso/vdso.lds.S create mode 100644 arch/x86/entry/vdso/vdso2c.c create mode 100644 arch/x86/entry/vdso/vdso2c.h create mode 100644 arch/x86/entry/vdso/vdso32-setup.c create mode 100644 arch/x86/entry/vdso/vdso32/.gitignore create mode 100644 arch/x86/entry/vdso/vdso32/int80.S create mode 100644 arch/x86/entry/vdso/vdso32/note.S create mode 100644 arch/x86/entry/vdso/vdso32/sigreturn.S create mode 100644 arch/x86/entry/vdso/vdso32/syscall.S create mode 100644 arch/x86/entry/vdso/vdso32/sysenter.S create mode 100644 arch/x86/entry/vdso/vdso32/vclock_gettime.c create mode 100644 arch/x86/entry/vdso/vdso32/vdso-fakesections.c create mode 100644 arch/x86/entry/vdso/vdso32/vdso32.lds.S create mode 100644 arch/x86/entry/vdso/vdsox32.lds.S create mode 100644 arch/x86/entry/vdso/vgetcpu.c create mode 100644 arch/x86/entry/vdso/vma.c delete mode 100644 arch/x86/vdso/.gitignore delete mode 100644 arch/x86/vdso/Makefile delete mode 100755 arch/x86/vdso/checkundef.sh delete mode 100644 arch/x86/vdso/vclock_gettime.c delete mode 100644 arch/x86/vdso/vdso-layout.lds.S delete mode 100644 arch/x86/vdso/vdso-note.S delete mode 100644 arch/x86/vdso/vdso.lds.S delete mode 100644 arch/x86/vdso/vdso2c.c delete mode 100644 arch/x86/vdso/vdso2c.h delete mode 100644 arch/x86/vdso/vdso32-setup.c delete mode 100644 arch/x86/vdso/vdso32/.gitignore delete mode 100644 arch/x86/vdso/vdso32/int80.S delete mode 100644 arch/x86/vdso/vdso32/note.S delete mode 100644 arch/x86/vdso/vdso32/sigreturn.S delete mode 100644 arch/x86/vdso/vdso32/syscall.S delete mode 100644 arch/x86/vdso/vdso32/sysenter.S delete mode 100644 arch/x86/vdso/vdso32/vclock_gettime.c delete mode 100644 arch/x86/vdso/vdso32/vdso-fakesections.c delete mode 100644 arch/x86/vdso/vdso32/vdso32.lds.S delete mode 100644 arch/x86/vdso/vdsox32.lds.S delete mode 100644 arch/x86/vdso/vgetcpu.c delete mode 100644 arch/x86/vdso/vma.c (limited to 'arch/x86') diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index b9b816277e72..1538562cc720 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -14,7 +14,7 @@ obj-y += kernel/ obj-y += mm/ obj-y += crypto/ -obj-y += vdso/ + obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 43e8328a23e4..90b1c3bfec46 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -244,7 +244,7 @@ install: PHONY += vdso_install vdso_install: - $(Q)$(MAKE) $(build)=arch/x86/vdso $@ + $(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@ archclean: $(Q)rm -rf $(objtree)/arch/i386 diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 4a626594fa79..c97532492ef5 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -3,5 +3,7 @@ # obj-y := entry_$(BITS).o +obj-y += vdso/ + obj-$(CONFIG_IA32_EMULATION) += ia32entry.o diff --git a/arch/x86/entry/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore new file mode 100644 index 000000000000..aae8ffdd5880 --- /dev/null +++ b/arch/x86/entry/vdso/.gitignore @@ -0,0 +1,7 @@ +vdso.lds +vdsox32.lds +vdso32-syscall-syms.lds +vdso32-sysenter-syms.lds +vdso32-int80-syms.lds +vdso-image-*.c +vdso2c diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile new file mode 100644 index 000000000000..e97032069f88 --- /dev/null +++ b/arch/x86/entry/vdso/Makefile @@ -0,0 +1,209 @@ +# +# Building vDSO images for x86. +# + +KBUILD_CFLAGS += $(DISABLE_LTO) +KASAN_SANITIZE := n + +VDSO64-$(CONFIG_X86_64) := y +VDSOX32-$(CONFIG_X86_X32_ABI) := y +VDSO32-$(CONFIG_X86_32) := y +VDSO32-$(CONFIG_COMPAT) := y + +# files to link into the vdso +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o + +# files to link into kernel +obj-y += vma.o + +# vDSO images to build +vdso_img-$(VDSO64-y) += 64 +vdso_img-$(VDSOX32-y) += x32 +vdso_img-$(VDSO32-y) += 32-int80 +vdso_img-$(CONFIG_COMPAT) += 32-syscall +vdso_img-$(VDSO32-y) += 32-sysenter + +obj-$(VDSO32-y) += vdso32-setup.o + +vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) + +$(obj)/vdso.o: $(obj)/vdso.so + +targets += vdso.lds $(vobjs-y) + +# Build the vDSO image C files and link them in. +vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) +vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) +vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) +obj-y += $(vdso_img_objs) +targets += $(vdso_img_cfiles) +targets += $(vdso_img_sodbg) +.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \ + $(vdso_img-y:%=$(obj)/vdso%.so) + +export CPPFLAGS_vdso.lds += -P -C + +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ + -Wl,--no-undefined \ + -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \ + $(DISABLE_LTO) + +$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,vdso) + +HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/x86/include/uapi +hostprogs-y += vdso2c + +quiet_cmd_vdso2c = VDSO2C $@ +define cmd_vdso2c + $(obj)/vdso2c $< $(<:%.dbg=%) $@ +endef + +$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE + $(call if_changed,vdso2c) + +# +# Don't omit frame pointers for ease of userspace debugging, but do +# optimize sibling calls. +# +CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ + $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ + -fno-omit-frame-pointer -foptimize-sibling-calls \ + -DDISABLE_BRANCH_PROFILING + +$(vobjs): KBUILD_CFLAGS += $(CFL) + +# +# vDSO code runs in userspace and -pg doesn't help with profiling anyway. +# +CFLAGS_REMOVE_vdso-note.o = -pg +CFLAGS_REMOVE_vclock_gettime.o = -pg +CFLAGS_REMOVE_vgetcpu.o = -pg +CFLAGS_REMOVE_vvar.o = -pg + +# +# X32 processes use x32 vDSO to access 64bit kernel data. +# +# Build x32 vDSO image: +# 1. Compile x32 vDSO as 64bit. +# 2. Convert object files to x32. +# 3. Build x32 VDSO image with x32 objects, which contains 64bit codes +# so that it can reach 64bit address space with 64bit pointers. +# + +CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \ + -Wl,-soname=linux-vdso.so.1 \ + -Wl,-z,max-page-size=4096 \ + -Wl,-z,common-page-size=4096 + +# 64-bit objects to re-brand as x32 +vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y)) + +# x32-rebranded versions +vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o) + +# same thing, but in the output directory +vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) + +# Convert 64bit object file to x32 for x32 vDSO. +quiet_cmd_x32 = X32 $@ + cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@ + +$(obj)/%-x32.o: $(obj)/%.o FORCE + $(call if_changed,x32) + +targets += vdsox32.lds $(vobjx32s-y) + +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg + $(call if_changed,objcopy) + +$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE + $(call if_changed,vdso) + +# +# Build multiple 32-bit vDSO images to choose from at boot time. +# +vdso32.so-$(VDSO32-y) += int80 +vdso32.so-$(CONFIG_COMPAT) += syscall +vdso32.so-$(VDSO32-y) += sysenter + +vdso32-images = $(vdso32.so-y:%=vdso32-%.so) + +CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 + +# This makes sure the $(obj) subdirectory exists even though vdso32/ +# is not a kbuild sub-make subdirectory. +override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ + +targets += vdso32/vdso32.lds +targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) +targets += vdso32/vclock_gettime.o + +$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 + +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic +KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) +KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) +KBUILD_CFLAGS_32 += -fno-omit-frame-pointer +KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING +$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) + +$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/note.o \ + $(obj)/vdso32/%.o + $(call if_changed,vdso) + +# +# The DSO images are built using a special linker script. +# +quiet_cmd_vdso = VDSO $@ + cmd_vdso = $(CC) -nostdlib -o $@ \ + $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ + sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' + +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ + $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) +GCOV_PROFILE := n + +# +# Install the unstripped copies of vdso*.so. If our toolchain supports +# build-id, install .build-id links as well. +# +quiet_cmd_vdso_install = INSTALL $(@:install_%=%) +define cmd_vdso_install + cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \ + if readelf -n $< |grep -q 'Build ID'; then \ + buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \ + first=`echo $$buildid | cut -b-2`; \ + last=`echo $$buildid | cut -b3-`; \ + mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \ + ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \ + fi +endef + +vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) + +$(MODLIB)/vdso: FORCE + @mkdir -p $(MODLIB)/vdso + +$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE + $(call cmd,vdso_install) + +PHONY += vdso_install $(vdso_img_insttargets) +vdso_install: $(vdso_img_insttargets) FORCE + +clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh new file mode 100755 index 000000000000..7ee90a9b549d --- /dev/null +++ b/arch/x86/entry/vdso/checkundef.sh @@ -0,0 +1,10 @@ +#!/bin/sh +nm="$1" +file="$2" +$nm "$file" | grep '^ *U' > /dev/null 2>&1 +if [ $? -eq 1 ]; then + exit 0 +else + echo "$file: undefined symbols found" >&2 + exit 1 +fi diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c new file mode 100644 index 000000000000..9793322751e0 --- /dev/null +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -0,0 +1,351 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of clock_gettime, gettimeofday, and time. + * + * 32 Bit compat layer by Stefani Seibold + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * The code should have no internal unresolved relocations. + * Check with readelf after changing. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define gtod (&VVAR(vsyscall_gtod_data)) + +extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); +extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); +extern time_t __vdso_time(time_t *t); + +#ifdef CONFIG_HPET_TIMER +extern u8 hpet_page + __attribute__((visibility("hidden"))); + +static notrace cycle_t vread_hpet(void) +{ + return *(const volatile u32 *)(&hpet_page + HPET_COUNTER); +} +#endif + +#ifndef BUILD_VDSO32 + +#include +#include +#include +#include + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + asm("syscall" : "=a" (ret) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + return ret; +} + +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) +{ + const struct pvclock_vsyscall_time_info *pvti_base; + int idx = cpu / (PAGE_SIZE/PVTI_SIZE); + int offset = cpu % (PAGE_SIZE/PVTI_SIZE); + + BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); + + pvti_base = (struct pvclock_vsyscall_time_info *) + __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); + + return &pvti_base[offset]; +} + +static notrace cycle_t vread_pvclock(int *mode) +{ + const struct pvclock_vsyscall_time_info *pvti; + cycle_t ret; + u64 last; + u32 version; + u8 flags; + unsigned cpu, cpu1; + + + /* + * Note: hypervisor must guarantee that: + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. + * 2. that per-CPU pvclock time info is updated if the + * underlying CPU changes. + * 3. that version is increased whenever underlying CPU + * changes. + * + */ + do { + cpu = __getcpu() & VGETCPU_CPU_MASK; + /* TODO: We can put vcpu id into higher bits of pvti.version. + * This will save a couple of cycles by getting rid of + * __getcpu() calls (Gleb). + */ + + pvti = get_pvti(cpu); + + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); + + /* + * Test we're still on the cpu as well as the version. + * We could have been migrated just after the first + * vgetcpu but before fetching the version, so we + * wouldn't notice a version change. + */ + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || + pvti->pvti.version != version)); + + if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + *mode = VCLOCK_NONE; + + /* refer to tsc.c read_tsc() comment for rationale */ + last = gtod->cycle_last; + + if (likely(ret >= last)) + return ret; + + return last; +} +#endif + +#else + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) + : "memory", "edx"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) + : "memory", "edx"); + return ret; +} + +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace cycle_t vread_pvclock(int *mode) +{ + *mode = VCLOCK_NONE; + return 0; +} +#endif + +#endif + +notrace static cycle_t vread_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)__native_read_tsc(); + + last = gtod->cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} + +notrace static inline u64 vgetsns(int *mode) +{ + u64 v; + cycles_t cycles; + + if (gtod->vclock_mode == VCLOCK_TSC) + cycles = vread_tsc(); +#ifdef CONFIG_HPET_TIMER + else if (gtod->vclock_mode == VCLOCK_HPET) + cycles = vread_hpet(); +#endif +#ifdef CONFIG_PARAVIRT_CLOCK + else if (gtod->vclock_mode == VCLOCK_PVCLOCK) + cycles = vread_pvclock(mode); +#endif + else + return 0; + v = (cycles - gtod->cycle_last) & gtod->mask; + return v * gtod->mult; +} + +/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ +notrace static int __always_inline do_realtime(struct timespec *ts) +{ + unsigned long seq; + u64 ns; + int mode; + + do { + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->wall_time_snsec; + ns += vgetsns(&mode); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + +notrace static int __always_inline do_monotonic(struct timespec *ts) +{ + unsigned long seq; + u64 ns; + int mode; + + do { + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; + ts->tv_sec = gtod->monotonic_time_sec; + ns = gtod->monotonic_time_snsec; + ns += vgetsns(&mode); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + +notrace static void do_realtime_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->wall_time_coarse_sec; + ts->tv_nsec = gtod->wall_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); +} + +notrace static void do_monotonic_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->monotonic_time_coarse_sec; + ts->tv_nsec = gtod->monotonic_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); +} + +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +{ + switch (clock) { + case CLOCK_REALTIME: + if (do_realtime(ts) == VCLOCK_NONE) + goto fallback; + break; + case CLOCK_MONOTONIC: + if (do_monotonic(ts) == VCLOCK_NONE) + goto fallback; + break; + case CLOCK_REALTIME_COARSE: + do_realtime_coarse(ts); + break; + case CLOCK_MONOTONIC_COARSE: + do_monotonic_coarse(ts); + break; + default: + goto fallback; + } + + return 0; +fallback: + return vdso_fallback_gettime(clock, ts); +} +int clock_gettime(clockid_t, struct timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); + +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + if (likely(tv != NULL)) { + if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) + return vdso_fallback_gtod(tv, tz); + tv->tv_usec /= 1000; + } + if (unlikely(tz != NULL)) { + tz->tz_minuteswest = gtod->tz_minuteswest; + tz->tz_dsttime = gtod->tz_dsttime; + } + + return 0; +} +int gettimeofday(struct timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); + +/* + * This will break when the xtime seconds get inaccurate, but that is + * unlikely + */ +notrace time_t __vdso_time(time_t *t) +{ + /* This is atomic on x86 so we don't need any locks. */ + time_t result = ACCESS_ONCE(gtod->wall_time_sec); + + if (t) + *t = result; + return result; +} +int time(time_t *t) + __attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S new file mode 100644 index 000000000000..de2c921025f5 --- /dev/null +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -0,0 +1,118 @@ +#include + +/* + * Linker script for vDSO. This is an ELF shared object prelinked to + * its virtual address, and with only one read-only segment. + * This script controls its layout. + */ + +#if defined(BUILD_VDSO64) +# define SHDR_SIZE 64 +#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32) +# define SHDR_SIZE 40 +#else +# error unknown VDSO target +#endif + +#define NUM_FAKE_SHDRS 13 + +SECTIONS +{ + /* + * User/kernel shared data is before the vDSO. This may be a little + * uglier than putting it after the vDSO, but it avoids issues with + * non-allocatable things that dangle past the end of the PT_LOAD + * segment. + */ + + vvar_start = . - 2 * PAGE_SIZE; + vvar_page = vvar_start; + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; +#define __VVAR_KERNEL_LDS +#include +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + hpet_page = vvar_start + PAGE_SIZE; + + . = SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { + *(.rodata*) + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + + /* + * Ideally this would live in a C file, but that won't + * work cleanly for x32 until we start building the x32 + * C code using an x32 toolchain. + */ + VDSO_FAKE_SECTION_TABLE_START = .; + . = . + NUM_FAKE_SHDRS * SHDR_SIZE; + VDSO_FAKE_SECTION_TABLE_END = .; + } :text + + .fake_shstrtab : { *(.fake_shstrtab) } :text + + + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + + /* + * Text is well-separated from actual data: there's plenty of + * stuff that isn't used at runtime in between. + */ + + .text : { *(.text*) } :text =0x90909090, + + /* + * At the end so that eu-elflint stays happy when vdso2c strips + * these. A better implementation would avoid allocating space + * for these. + */ + .altinstructions : { *(.altinstructions) } :text + .altinstr_replacement : { *(.altinstr_replacement) } :text + + /DISCARD/ : { + *(.discard) + *(.discard.*) + *(__bug_table) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S new file mode 100644 index 000000000000..79a071e4357e --- /dev/null +++ b/arch/x86/entry/vdso/vdso-note.S @@ -0,0 +1,12 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include +#include +#include + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S new file mode 100644 index 000000000000..6807932643c2 --- /dev/null +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -0,0 +1,29 @@ +/* + * Linker script for 64-bit vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#define BUILD_VDSO64 + +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + time; + __vdso_time; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c new file mode 100644 index 000000000000..8627db24a7f6 --- /dev/null +++ b/arch/x86/entry/vdso/vdso2c.c @@ -0,0 +1,253 @@ +/* + * vdso2c - A vdso image preparation tool + * Copyright (c) 2014 Andy Lutomirski and others + * Licensed under the GPL v2 + * + * vdso2c requires stripped and unstripped input. It would be trivial + * to fully strip the input in here, but, for reasons described below, + * we need to write a section table. Doing this is more or less + * equivalent to dropping all non-allocatable sections, but it's + * easier to let objcopy handle that instead of doing it ourselves. + * If we ever need to do something fancier than what objcopy provides, + * it would be straightforward to add here. + * + * We're keep a section table for a few reasons: + * + * The Go runtime had a couple of bugs: it would read the section + * table to try to figure out how many dynamic symbols there were (it + * shouldn't have looked at the section table at all) and, if there + * were no SHT_SYNDYM section table entry, it would use an + * uninitialized value for the number of symbols. An empty DYNSYM + * table would work, but I see no reason not to write a valid one (and + * keep full performance for old Go programs). This hack is only + * needed on x86_64. + * + * The bug was introduced on 2012-08-31 by: + * https://code.google.com/p/go/source/detail?r=56ea40aac72b + * and was fixed on 2014-06-13 by: + * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 + * + * Binutils has issues debugging the vDSO: it reads the section table to + * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which + * would break build-id if we removed the section table. Binutils + * also requires that shstrndx != 0. See: + * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 + * + * elfutils might not look for PT_NOTE if there is a section table at + * all. I don't know whether this matters for any practical purpose. + * + * For simplicity, rather than hacking up a partial section table, we + * just write a mostly complete one. We omit non-dynamic symbols, + * though, since they're rather large. + * + * Once binutils gets fixed, we might be able to drop this for all but + * the 64-bit vdso, since build-id only works in kernel RPMs, and + * systems that update to new enough kernel RPMs will likely update + * binutils in sync. build-id has never worked for home-built kernel + * RPMs without manual symlinking, and I suspect that no one ever does + * that. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +const char *outfilename; + +/* Symbols that we need in vdso2c. */ +enum { + sym_vvar_start, + sym_vvar_page, + sym_hpet_page, + sym_VDSO_FAKE_SECTION_TABLE_START, + sym_VDSO_FAKE_SECTION_TABLE_END, +}; + +const int special_pages[] = { + sym_vvar_page, + sym_hpet_page, +}; + +struct vdso_sym { + const char *name; + bool export; +}; + +struct vdso_sym required_syms[] = { + [sym_vvar_start] = {"vvar_start", true}, + [sym_vvar_page] = {"vvar_page", true}, + [sym_hpet_page] = {"hpet_page", true}, + [sym_VDSO_FAKE_SECTION_TABLE_START] = { + "VDSO_FAKE_SECTION_TABLE_START", false + }, + [sym_VDSO_FAKE_SECTION_TABLE_END] = { + "VDSO_FAKE_SECTION_TABLE_END", false + }, + {"VDSO32_NOTE_MASK", true}, + {"VDSO32_SYSENTER_RETURN", true}, + {"__kernel_vsyscall", true}, + {"__kernel_sigreturn", true}, + {"__kernel_rt_sigreturn", true}, +}; + +__attribute__((format(printf, 1, 2))) __attribute__((noreturn)) +static void fail(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + fprintf(stderr, "Error: "); + vfprintf(stderr, format, ap); + if (outfilename) + unlink(outfilename); + exit(1); + va_end(ap); +} + +/* + * Evil macros for little-endian reads and writes + */ +#define GLE(x, bits, ifnot) \ + __builtin_choose_expr( \ + (sizeof(*(x)) == bits/8), \ + (__typeof__(*(x)))get_unaligned_le##bits(x), ifnot) + +extern void bad_get_le(void); +#define LAST_GLE(x) \ + __builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le()) + +#define GET_LE(x) \ + GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x)))) + +#define PLE(x, val, bits, ifnot) \ + __builtin_choose_expr( \ + (sizeof(*(x)) == bits/8), \ + put_unaligned_le##bits((val), (x)), ifnot) + +extern void bad_put_le(void); +#define LAST_PLE(x, val) \ + __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le()) + +#define PUT_LE(x, val) \ + PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val)))) + + +#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) + +#define BITSFUNC3(name, bits, suffix) name##bits##suffix +#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix) +#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, ) + +#define INT_BITS BITSFUNC2(int, ELF_BITS, _t) + +#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x +#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x) +#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x) + +#define ELF_BITS 64 +#include "vdso2c.h" +#undef ELF_BITS + +#define ELF_BITS 32 +#include "vdso2c.h" +#undef ELF_BITS + +static void go(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) +{ + Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr; + + if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { + go64(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); + } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { + go32(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); + } else { + fail("unknown ELF class\n"); + } +} + +static void map_input(const char *name, void **addr, size_t *len, int prot) +{ + off_t tmp_len; + + int fd = open(name, O_RDONLY); + if (fd == -1) + err(1, "%s", name); + + tmp_len = lseek(fd, 0, SEEK_END); + if (tmp_len == (off_t)-1) + err(1, "lseek"); + *len = (size_t)tmp_len; + + *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0); + if (*addr == MAP_FAILED) + err(1, "mmap"); + + close(fd); +} + +int main(int argc, char **argv) +{ + size_t raw_len, stripped_len; + void *raw_addr, *stripped_addr; + FILE *outfile; + char *name, *tmp; + int namelen; + + if (argc != 4) { + printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n"); + return 1; + } + + /* + * Figure out the struct name. If we're writing to a .so file, + * generate raw output insted. + */ + name = strdup(argv[3]); + namelen = strlen(name); + if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) { + name = NULL; + } else { + tmp = strrchr(name, '/'); + if (tmp) + name = tmp + 1; + tmp = strchr(name, '.'); + if (tmp) + *tmp = '\0'; + for (tmp = name; *tmp; tmp++) + if (*tmp == '-') + *tmp = '_'; + } + + map_input(argv[1], &raw_addr, &raw_len, PROT_READ); + map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ); + + outfilename = argv[3]; + outfile = fopen(outfilename, "w"); + if (!outfile) + err(1, "%s", argv[2]); + + go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); + + munmap(raw_addr, raw_len); + munmap(stripped_addr, stripped_len); + fclose(outfile); + + return 0; +} diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h new file mode 100644 index 000000000000..0224987556ce --- /dev/null +++ b/arch/x86/entry/vdso/vdso2c.h @@ -0,0 +1,175 @@ +/* + * This file is included twice from vdso2c.c. It generates code for 32-bit + * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs + * are built for 32-bit userspace. + */ + +static void BITSFUNC(go)(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) +{ + int found_load = 0; + unsigned long load_size = -1; /* Work around bogus warning */ + unsigned long mapping_size; + ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; + int i; + unsigned long j; + ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, + *alt_sec = NULL; + ELF(Dyn) *dyn = 0, *dyn_end = 0; + const char *secstrings; + INT_BITS syms[NSYMS] = {}; + + ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff)); + + /* Walk the segment table. */ + for (i = 0; i < GET_LE(&hdr->e_phnum); i++) { + if (GET_LE(&pt[i].p_type) == PT_LOAD) { + if (found_load) + fail("multiple PT_LOAD segs\n"); + + if (GET_LE(&pt[i].p_offset) != 0 || + GET_LE(&pt[i].p_vaddr) != 0) + fail("PT_LOAD in wrong place\n"); + + if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz)) + fail("cannot handle memsz != filesz\n"); + + load_size = GET_LE(&pt[i].p_memsz); + found_load = 1; + } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) { + dyn = raw_addr + GET_LE(&pt[i].p_offset); + dyn_end = raw_addr + GET_LE(&pt[i].p_offset) + + GET_LE(&pt[i].p_memsz); + } + } + if (!found_load) + fail("no PT_LOAD seg\n"); + + if (stripped_len < load_size) + fail("stripped input is too short\n"); + + /* Walk the dynamic table */ + for (i = 0; dyn + i < dyn_end && + GET_LE(&dyn[i].d_tag) != DT_NULL; i++) { + typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag); + if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA || + tag == DT_RELENT || tag == DT_TEXTREL) + fail("vdso image contains dynamic relocations\n"); + } + + /* Walk the section table */ + secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx); + secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset); + for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { + ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * i; + if (GET_LE(&sh->sh_type) == SHT_SYMTAB) + symtab_hdr = sh; + + if (!strcmp(secstrings + GET_LE(&sh->sh_name), + ".altinstructions")) + alt_sec = sh; + } + + if (!symtab_hdr) + fail("no symbol table\n"); + + strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link); + + /* Walk the symbol table */ + for (i = 0; + i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); + i++) { + int k; + ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + + GET_LE(&symtab_hdr->sh_entsize) * i; + const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) + + GET_LE(&sym->st_name); + + for (k = 0; k < NSYMS; k++) { + if (!strcmp(name, required_syms[k].name)) { + if (syms[k]) { + fail("duplicate symbol %s\n", + required_syms[k].name); + } + + /* + * Careful: we use negative addresses, but + * st_value is unsigned, so we rely + * on syms[k] being a signed type of the + * correct width. + */ + syms[k] = GET_LE(&sym->st_value); + } + } + } + + /* Validate mapping addresses. */ + for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { + INT_BITS symval = syms[special_pages[i]]; + + if (!symval) + continue; /* The mapping isn't used; ignore it. */ + + if (symval % 4096) + fail("%s must be a multiple of 4096\n", + required_syms[i].name); + if (symval + 4096 < syms[sym_vvar_start]) + fail("%s underruns vvar_start\n", + required_syms[i].name); + if (symval + 4096 > 0) + fail("%s is on the wrong side of the vdso text\n", + required_syms[i].name); + } + if (syms[sym_vvar_start] % 4096) + fail("vvar_begin must be a multiple of 4096\n"); + + if (!name) { + fwrite(stripped_addr, stripped_len, 1, outfile); + return; + } + + mapping_size = (stripped_len + 4095) / 4096 * 4096; + + fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "\n"); + fprintf(outfile, + "static unsigned char raw_data[%lu] __page_aligned_data = {", + mapping_size); + for (j = 0; j < stripped_len; j++) { + if (j % 10 == 0) + fprintf(outfile, "\n\t"); + fprintf(outfile, "0x%02X, ", + (int)((unsigned char *)stripped_addr)[j]); + } + fprintf(outfile, "\n};\n\n"); + + fprintf(outfile, "static struct page *pages[%lu];\n\n", + mapping_size / 4096); + + fprintf(outfile, "const struct vdso_image %s = {\n", name); + fprintf(outfile, "\t.data = raw_data,\n"); + fprintf(outfile, "\t.size = %lu,\n", mapping_size); + fprintf(outfile, "\t.text_mapping = {\n"); + fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); + fprintf(outfile, "\t\t.pages = pages,\n"); + fprintf(outfile, "\t},\n"); + if (alt_sec) { + fprintf(outfile, "\t.alt = %lu,\n", + (unsigned long)GET_LE(&alt_sec->sh_offset)); + fprintf(outfile, "\t.alt_len = %lu,\n", + (unsigned long)GET_LE(&alt_sec->sh_size)); + } + for (i = 0; i < NSYMS; i++) { + if (required_syms[i].export && syms[i]) + fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", + required_syms[i].name, (int64_t)syms[i]); + } + fprintf(outfile, "};\n"); +} diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c new file mode 100644 index 000000000000..e904c270573b --- /dev/null +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -0,0 +1,120 @@ +/* + * (C) Copyright 2002 Linus Torvalds + * Portions based on the vdso-randomization code from exec-shield: + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar + * + * This file contains the needed initializations to support sysenter. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_COMPAT_VDSO +#define VDSO_DEFAULT 0 +#else +#define VDSO_DEFAULT 1 +#endif + +/* + * Should the kernel map a VDSO page into processes and pass its + * address down to glibc upon exec()? + */ +unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT; + +static int __init vdso32_setup(char *s) +{ + vdso32_enabled = simple_strtoul(s, NULL, 0); + + if (vdso32_enabled > 1) + pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); + + return 1; +} + +/* + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO + * behavior on both 64-bit and 32-bit kernels. + * On 32-bit kernels, vdso=[012] means the same thing. + */ +__setup("vdso32=", vdso32_setup); + +#ifdef CONFIG_X86_32 +__setup_param("vdso=", vdso_setup, vdso32_setup, 0); +#endif + +#ifdef CONFIG_X86_64 + +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) + +#else /* CONFIG_X86_32 */ + +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) +#define vdso32_syscall() (0) + +#endif /* CONFIG_X86_64 */ + +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +const struct vdso_image *selected_vdso32; +#endif + +int __init sysenter_setup(void) +{ +#ifdef CONFIG_COMPAT + if (vdso32_syscall()) + selected_vdso32 = &vdso_image_32_syscall; + else +#endif + if (vdso32_sysenter()) + selected_vdso32 = &vdso_image_32_sysenter; + else + selected_vdso32 = &vdso_image_32_int80; + + init_vdso_image(selected_vdso32); + + return 0; +} + +#ifdef CONFIG_X86_64 + +subsys_initcall(sysenter_setup); + +#ifdef CONFIG_SYSCTL +/* Register vsyscall32 into the ABI table */ +#include + +static struct ctl_table abi_table2[] = { + { + .procname = "vsyscall32", + .data = &vdso32_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + {} +}; + +static struct ctl_table abi_root_table2[] = { + { + .procname = "abi", + .mode = 0555, + .child = abi_table2 + }, + {} +}; + +static __init int ia32_binfmt_init(void) +{ + register_sysctl_table(abi_root_table2); + return 0; +} +__initcall(ia32_binfmt_init); +#endif /* CONFIG_SYSCTL */ + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore new file mode 100644 index 000000000000..e45fba9d0ced --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/.gitignore @@ -0,0 +1 @@ +vdso32.lds diff --git a/arch/x86/entry/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S new file mode 100644 index 000000000000..b15b7c01aedb --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/int80.S @@ -0,0 +1,56 @@ +/* + * Code for the vDSO. This version uses the old int $0x80 method. + * + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#include "sigreturn.S" + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: +.LSTART_vsyscall: + int $0x80 + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI: + .long .LENDCIEDLSI-.LSTARTCIEDLSI +.LSTARTCIEDLSI: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIEDLSI: + .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ +.LSTARTFDEDLSI: + .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 + .align 4 +.LENDFDEDLSI: + .previous + + /* + * Pad out the segment to match the size of the sysenter.S version. + */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 + .previous diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S new file mode 100644 index 000000000000..c83f25734696 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/note.S @@ -0,0 +1,44 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include +#include + +/* Ideally this would use UTS_NAME, but using a quoted string here + doesn't work. Remember to change this when changing the + kernel's name. */ +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END + +#ifdef CONFIG_XEN +/* + * Add a special note telling glibc's dynamic linker a fake hardware + * flavor that it will use to choose the search path for libraries in the + * same way it uses real hardware capabilities like "mmx". + * We supply "nosegneg" as the fake capability, to indicate that we + * do not like negative offsets in instructions using segment overrides, + * since we implement those inefficiently. This makes it possible to + * install libraries optimized to avoid those access patterns in someplace + * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file + * corresponding to the bits here is needed to make ldconfig work right. + * It should contain: + * hwcap 1 nosegneg + * to match the mapping of bit to name that we give here. + * + * At runtime, the fake hardware feature will be considered to be present + * if its bit is set in the mask word. So, we start with the mask 0, and + * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. + */ + +#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ + +ELFNOTE_START(GNU, 2, "a") + .long 1 /* ncaps */ +VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */ + .long 0 /* mask */ + .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ +ELFNOTE_END +#endif diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S new file mode 100644 index 000000000000..d7ec4e251c0a --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -0,0 +1,145 @@ +/* + * Common code for the sigreturn entry points in vDSO images. + * So far this code is the same for both int80 and sysenter versions. + * This file is #include'd by int80.S et al to define them first thing. + * The kernel assumes that the addresses of these routines are constant + * for all vDSO implementations. + */ + +#include +#include +#include + +#ifndef SYSCALL_ENTER_KERNEL +#define SYSCALL_ENTER_KERNEL int $0x80 +#endif + + .text + .globl __kernel_sigreturn + .type __kernel_sigreturn,@function + nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */ + ALIGN +__kernel_sigreturn: +.LSTART_sigreturn: + popl %eax /* XXX does this mean it needs unwind info? */ + movl $__NR_sigreturn, %eax + SYSCALL_ENTER_KERNEL +.LEND_sigreturn: + nop + .size __kernel_sigreturn,.-.LSTART_sigreturn + + .globl __kernel_rt_sigreturn + .type __kernel_rt_sigreturn,@function + ALIGN +__kernel_rt_sigreturn: +.LSTART_rt_sigreturn: + movl $__NR_rt_sigreturn, %eax + SYSCALL_ENTER_KERNEL +.LEND_rt_sigreturn: + nop + .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI1: + .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 +.LSTARTCIEDLSI1: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zRS" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0 /* DW_CFA_nop */ + .align 4 +.LENDCIEDLSI1: + .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ +.LSTARTFDEDLSI1: + .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ + /* HACK: The dwarf2 unwind routines will subtract 1 from the + return address to get an address in the middle of the + presumed call instruction. Since we didn't get here via + a call, we need to include the nop before the real start + to make up for it. */ + .long .LSTART_sigreturn-1-. /* PC-relative start address */ + .long .LEND_sigreturn-.LSTART_sigreturn+1 + .uleb128 0 /* Augmentation */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + complicated by the fact that the "CFA" is always assumed to + be the value of the stack pointer in the caller. This means + that we must define the CFA of this body of code to be the + saved value of the stack pointer in the sigcontext. Which + also means that there is no fixed relation to the other + saved registers, which means that we must use DW_CFA_expression + to compute their addresses. It also means that when we + adjust the stack with the popl, we have to do it all over again. */ + +#define do_cfa_expr(offset) \ + .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ + .byte 0x06; /* DW_OP_deref */ \ +1: + +#define do_expr(regno, offset) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno; /* regno */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ +1: + + do_cfa_expr(IA32_SIGCONTEXT_sp+4) + do_expr(0, IA32_SIGCONTEXT_ax+4) + do_expr(1, IA32_SIGCONTEXT_cx+4) + do_expr(2, IA32_SIGCONTEXT_dx+4) + do_expr(3, IA32_SIGCONTEXT_bx+4) + do_expr(5, IA32_SIGCONTEXT_bp+4) + do_expr(6, IA32_SIGCONTEXT_si+4) + do_expr(7, IA32_SIGCONTEXT_di+4) + do_expr(8, IA32_SIGCONTEXT_ip+4) + + .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ + + do_cfa_expr(IA32_SIGCONTEXT_sp) + do_expr(0, IA32_SIGCONTEXT_ax) + do_expr(1, IA32_SIGCONTEXT_cx) + do_expr(2, IA32_SIGCONTEXT_dx) + do_expr(3, IA32_SIGCONTEXT_bx) + do_expr(5, IA32_SIGCONTEXT_bp) + do_expr(6, IA32_SIGCONTEXT_si) + do_expr(7, IA32_SIGCONTEXT_di) + do_expr(8, IA32_SIGCONTEXT_ip) + + .align 4 +.LENDFDEDLSI1: + + .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ +.LSTARTFDEDLSI2: + .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ + /* HACK: See above wrt unwind library assumptions. */ + .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ + .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 + .uleb128 0 /* Augmentation */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + slightly less complicated than the above, since we don't + modify the stack pointer in the process. */ + + do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp) + do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax) + do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx) + do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx) + do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx) + do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp) + do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si) + do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di) + do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip) + + .align 4 +.LENDFDEDLSI2: + .previous diff --git a/arch/x86/entry/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S new file mode 100644 index 000000000000..6b286bb5251c --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/syscall.S @@ -0,0 +1,75 @@ +/* + * Code for the vDSO. This version uses the syscall instruction. + * + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#define SYSCALL_ENTER_KERNEL syscall +#include "sigreturn.S" + +#include + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: +.LSTART_vsyscall: + push %ebp +.Lpush_ebp: + movl %ecx, %ebp + syscall + movl %ebp, %ecx + popl %ebp +.Lpop_ebp: + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + + .section .eh_frame,"a",@progbits +.LSTARTFRAME: + .long .LENDCIE-.LSTARTCIE +.LSTARTCIE: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIE: + + .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ +.LSTARTFDE1: + .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 /* Augmentation length */ + /* What follows are the instructions for the table generation. + We have to record all changes of the stack pointer. */ + .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .uleb128 8 + .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ + .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ + .byte 0xc5 /* DW_CFA_restore %ebp */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .uleb128 4 + .align 4 +.LENDFDE1: + .previous + + /* + * Pad out the segment to match the size of the sysenter.S version. + */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 + .previous diff --git a/arch/x86/entry/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S new file mode 100644 index 000000000000..e354bceee0e0 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/sysenter.S @@ -0,0 +1,116 @@ +/* + * Code for the vDSO. This version uses the sysenter instruction. + * + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#include "sigreturn.S" + +/* + * The caller puts arg2 in %ecx, which gets pushed. The kernel will use + * %ecx itself for arg2. The pushing is because the sysexit instruction + * (found in entry.S) requires that we clobber %ecx with the desired %esp. + * User code might expect that %ecx is unclobbered though, as it would be + * for returning via the iret instruction, so we must push and pop. + * + * The caller puts arg3 in %edx, which the sysexit instruction requires + * for %eip. Thus, exactly as for arg2, we must push and pop. + * + * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter + * instruction clobbers %esp, the user's %esp won't even survive entry + * into the kernel. We store %esp in %ebp. Code in entry.S must fetch + * arg6 from the stack. + * + * You can not use this vsyscall for the clone() syscall because the + * three words on the parent stack do not get copied to the child. + */ + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: +.LSTART_vsyscall: + push %ecx +.Lpush_ecx: + push %edx +.Lpush_edx: + push %ebp +.Lenter_kernel: + movl %esp,%ebp + sysenter + + /* 7: align return point with nop's to make disassembly easier */ + .space 7,0x90 + + /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ + int $0x80 + /* 16: System call normal return point is here! */ +VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ + pop %ebp +.Lpop_ebp: + pop %edx +.Lpop_edx: + pop %ecx +.Lpop_ecx: + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI: + .long .LENDCIEDLSI-.LSTARTCIEDLSI +.LSTARTCIEDLSI: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIEDLSI: + .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ +.LSTARTFDEDLSI: + .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 + /* What follows are the instructions for the table generation. + We have to record all changes of the stack pointer. */ + .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x08 /* RA at offset 8 now */ + .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x0c /* RA at offset 12 now */ + .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x10 /* RA at offset 16 now */ + .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ + /* Finally the epilogue. */ + .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x0c /* RA at offset 12 now */ + .byte 0xc5 /* DW_CFA_restore %ebp */ + .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x08 /* RA at offset 8 now */ + .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x04 /* RA at offset 4 now */ + .align 4 +.LENDFDEDLSI: + .previous + + /* + * Emit a symbol with the size of this .eh_frame data, + * to verify it matches the other versions. + */ +VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c new file mode 100644 index 000000000000..175cc72c0f68 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -0,0 +1,30 @@ +#define BUILD_VDSO32 + +#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE +#undef CONFIG_OPTIMIZE_INLINING +#endif + +#undef CONFIG_X86_PPRO_FENCE + +#ifdef CONFIG_X86_64 + +/* + * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel + * configuration + */ +#undef CONFIG_64BIT +#undef CONFIG_X86_64 +#undef CONFIG_ILLEGAL_POINTER_VALUE +#undef CONFIG_SPARSEMEM_VMEMMAP +#undef CONFIG_NR_CPUS + +#define CONFIG_X86_32 1 +#define CONFIG_PAGE_OFFSET 0 +#define CONFIG_ILLEGAL_POINTER_VALUE 0 +#define CONFIG_NR_CPUS 1 + +#define BUILD_VDSO32_64 + +#endif + +#include "../vclock_gettime.c" diff --git a/arch/x86/entry/vdso/vdso32/vdso-fakesections.c b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c new file mode 100644 index 000000000000..541468e25265 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c @@ -0,0 +1 @@ +#include "../vdso-fakesections.c" diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S new file mode 100644 index 000000000000..31056cf294bf --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S @@ -0,0 +1,37 @@ +/* + * Linker script for 32-bit vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#include + +#define BUILD_VDSO32 + +#include "../vdso-layout.lds.S" + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__kernel_vsyscall); + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION +{ + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_time; + }; + + LINUX_2.5 { + global: + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S new file mode 100644 index 000000000000..697c11ece90c --- /dev/null +++ b/arch/x86/entry/vdso/vdsox32.lds.S @@ -0,0 +1,25 @@ +/* + * Linker script for x32 vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#define BUILD_VDSOX32 + +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_getcpu; + __vdso_time; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c new file mode 100644 index 000000000000..8ec3d1f4ce9a --- /dev/null +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -0,0 +1,28 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of getcpu() + */ + +#include +#include +#include +#include + +notrace long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +{ + unsigned int p; + + p = __getcpu(); + + if (cpu) + *cpu = p & VGETCPU_CPU_MASK; + if (node) + *node = p >> 12; + return 0; +} + +long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) + __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c new file mode 100644 index 000000000000..1c9f750c3859 --- /dev/null +++ b/arch/x86/entry/vdso/vma.c @@ -0,0 +1,300 @@ +/* + * Copyright 2007 Andi Kleen, SUSE Labs. + * Subject to the GPL, v.2 + * + * This contains most of the x86 vDSO kernel-side code. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_X86_64) +unsigned int __read_mostly vdso64_enabled = 1; +#endif + +void __init init_vdso_image(const struct vdso_image *image) +{ + int i; + int npages = (image->size) / PAGE_SIZE; + + BUG_ON(image->size % PAGE_SIZE != 0); + for (i = 0; i < npages; i++) + image->text_mapping.pages[i] = + virt_to_page(image->data + i*PAGE_SIZE); + + apply_alternatives((struct alt_instr *)(image->data + image->alt), + (struct alt_instr *)(image->data + image->alt + + image->alt_len)); +} + +struct linux_binprm; + +/* + * Put the vdso above the (randomized) stack with another randomized + * offset. This way there is no hole in the middle of address space. + * To save memory make sure it is still in the same PTE as the stack + * top. This doesn't give that many random bits. + * + * Note that this algorithm is imperfect: the distribution of the vdso + * start address within a PMD is biased toward the end. + * + * Only used for the 64-bit and x32 vdsos. + */ +static unsigned long vdso_addr(unsigned long start, unsigned len) +{ +#ifdef CONFIG_X86_32 + return 0; +#else + unsigned long addr, end; + unsigned offset; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; + if (end >= TASK_SIZE_MAX) + end = TASK_SIZE_MAX; + end -= len; + + if (end > start) { + offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + + /* + * Forcibly align the final address in case we have a hardware + * issue that requires alignment for performance reasons. + */ + addr = align_vdso_addr(addr); + + return addr; +#endif +} + +static int map_vdso(const struct vdso_image *image, bool calculate_addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long addr, text_start; + int ret = 0; + static struct page *no_pages[] = {NULL}; + static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .pages = no_pages, + }; + + if (calculate_addr) { + addr = vdso_addr(current->mm->start_stack, + image->size - image->sym_vvar_start); + } else { + addr = 0; + } + + down_write(&mm->mmap_sem); + + addr = get_unmapped_area(NULL, addr, + image->size - image->sym_vvar_start, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + text_start = addr - image->sym_vvar_start; + current->mm->context.vdso = (void __user *)text_start; + + /* + * MAYWRITE to allow gdb to COW and set breakpoints + */ + vma = _install_special_mapping(mm, + text_start, + image->size, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + &image->text_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto up_fail; + } + + vma = _install_special_mapping(mm, + addr, + -image->sym_vvar_start, + VM_READ|VM_MAYREAD, + &vvar_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto up_fail; + } + + if (image->sym_vvar_page) + ret = remap_pfn_range(vma, + text_start + image->sym_vvar_page, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; + +#ifdef CONFIG_HPET_TIMER + if (hpet_address && image->sym_hpet_page) { + ret = io_remap_pfn_range(vma, + text_start + image->sym_hpet_page, + hpet_address >> PAGE_SHIFT, + PAGE_SIZE, + pgprot_noncached(PAGE_READONLY)); + + if (ret) + goto up_fail; + } +#endif + +up_fail: + if (ret) + current->mm->context.vdso = NULL; + + up_write(&mm->mmap_sem); + return ret; +} + +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +static int load_vdso32(void) +{ + int ret; + + if (vdso32_enabled != 1) /* Other values all mean "disabled" */ + return 0; + + ret = map_vdso(selected_vdso32, false); + if (ret) + return ret; + + if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) + current_thread_info()->sysenter_return = + current->mm->context.vdso + + selected_vdso32->sym_VDSO32_SYSENTER_RETURN; + + return 0; +} +#endif + +#ifdef CONFIG_X86_64 +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_64, true); +} + +#ifdef CONFIG_COMPAT +int compat_arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp) +{ +#ifdef CONFIG_X86_X32_ABI + if (test_thread_flag(TIF_X32)) { + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_x32, true); + } +#endif + + return load_vdso32(); +} +#endif +#else +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + return load_vdso32(); +} +#endif + +#ifdef CONFIG_X86_64 +static __init int vdso_setup(char *s) +{ + vdso64_enabled = simple_strtoul(s, NULL, 0); + return 0; +} +__setup("vdso=", vdso_setup); +#endif + +#ifdef CONFIG_X86_64 +static void vgetcpu_cpu_init(void *arg) +{ + int cpu = smp_processor_id(); + struct desc_struct d = { }; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node(cpu); +#endif + if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + write_rdtscp_aux((node << 12) | cpu); + + /* + * Store cpu number in limit so that it can be loaded + * quickly in user space in vgetcpu. (12 bits for the CPU + * and 8 bits for the node) + */ + d.limit0 = cpu | ((node & 0xf) << 12); + d.limit = node >> 4; + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ + + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); +} + +static int +vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg) +{ + long cpu = (long)arg; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); + + return NOTIFY_DONE; +} + +static int __init init_vdso(void) +{ + init_vdso_image(&vdso_image_64); + +#ifdef CONFIG_X86_X32_ABI + init_vdso_image(&vdso_image_x32); +#endif + + cpu_notifier_register_begin(); + + on_each_cpu(vgetcpu_cpu_init, NULL, 1); + /* notifier priority > KVM */ + __hotcpu_notifier(vgetcpu_cpu_notifier, 30); + + cpu_notifier_register_done(); + + return 0; +} +subsys_initcall(init_vdso); +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore deleted file mode 100644 index aae8ffdd5880..000000000000 --- a/arch/x86/vdso/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -vdso.lds -vdsox32.lds -vdso32-syscall-syms.lds -vdso32-sysenter-syms.lds -vdso32-int80-syms.lds -vdso-image-*.c -vdso2c diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile deleted file mode 100644 index e97032069f88..000000000000 --- a/arch/x86/vdso/Makefile +++ /dev/null @@ -1,209 +0,0 @@ -# -# Building vDSO images for x86. -# - -KBUILD_CFLAGS += $(DISABLE_LTO) -KASAN_SANITIZE := n - -VDSO64-$(CONFIG_X86_64) := y -VDSOX32-$(CONFIG_X86_X32_ABI) := y -VDSO32-$(CONFIG_X86_32) := y -VDSO32-$(CONFIG_COMPAT) := y - -# files to link into the vdso -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o - -# files to link into kernel -obj-y += vma.o - -# vDSO images to build -vdso_img-$(VDSO64-y) += 64 -vdso_img-$(VDSOX32-y) += x32 -vdso_img-$(VDSO32-y) += 32-int80 -vdso_img-$(CONFIG_COMPAT) += 32-syscall -vdso_img-$(VDSO32-y) += 32-sysenter - -obj-$(VDSO32-y) += vdso32-setup.o - -vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) - -$(obj)/vdso.o: $(obj)/vdso.so - -targets += vdso.lds $(vobjs-y) - -# Build the vDSO image C files and link them in. -vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) -vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) -vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) -obj-y += $(vdso_img_objs) -targets += $(vdso_img_cfiles) -targets += $(vdso_img_sodbg) -.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \ - $(vdso_img-y:%=$(obj)/vdso%.so) - -export CPPFLAGS_vdso.lds += -P -C - -VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ - -Wl,--no-undefined \ - -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \ - $(DISABLE_LTO) - -$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE - $(call if_changed,vdso) - -HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/x86/include/uapi -hostprogs-y += vdso2c - -quiet_cmd_vdso2c = VDSO2C $@ -define cmd_vdso2c - $(obj)/vdso2c $< $(<:%.dbg=%) $@ -endef - -$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE - $(call if_changed,vdso2c) - -# -# Don't omit frame pointers for ease of userspace debugging, but do -# optimize sibling calls. -# -CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ - $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ - -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING - -$(vobjs): KBUILD_CFLAGS += $(CFL) - -# -# vDSO code runs in userspace and -pg doesn't help with profiling anyway. -# -CFLAGS_REMOVE_vdso-note.o = -pg -CFLAGS_REMOVE_vclock_gettime.o = -pg -CFLAGS_REMOVE_vgetcpu.o = -pg -CFLAGS_REMOVE_vvar.o = -pg - -# -# X32 processes use x32 vDSO to access 64bit kernel data. -# -# Build x32 vDSO image: -# 1. Compile x32 vDSO as 64bit. -# 2. Convert object files to x32. -# 3. Build x32 VDSO image with x32 objects, which contains 64bit codes -# so that it can reach 64bit address space with 64bit pointers. -# - -CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \ - -Wl,-soname=linux-vdso.so.1 \ - -Wl,-z,max-page-size=4096 \ - -Wl,-z,common-page-size=4096 - -# 64-bit objects to re-brand as x32 -vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y)) - -# x32-rebranded versions -vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o) - -# same thing, but in the output directory -vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) - -# Convert 64bit object file to x32 for x32 vDSO. -quiet_cmd_x32 = X32 $@ - cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@ - -$(obj)/%-x32.o: $(obj)/%.o FORCE - $(call if_changed,x32) - -targets += vdsox32.lds $(vobjx32s-y) - -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg - $(call if_changed,objcopy) - -$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE - $(call if_changed,vdso) - -# -# Build multiple 32-bit vDSO images to choose from at boot time. -# -vdso32.so-$(VDSO32-y) += int80 -vdso32.so-$(CONFIG_COMPAT) += syscall -vdso32.so-$(VDSO32-y) += sysenter - -vdso32-images = $(vdso32.so-y:%=vdso32-%.so) - -CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 - -# This makes sure the $(obj) subdirectory exists even though vdso32/ -# is not a kbuild sub-make subdirectory. -override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ - -targets += vdso32/vdso32.lds -targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) -targets += vdso32/vclock_gettime.o - -$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) - -KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) -$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 - -KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) -KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) -KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) -KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic -KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) -KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) -KBUILD_CFLAGS_32 += -fno-omit-frame-pointer -KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) - -$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ - $(obj)/vdso32/vdso32.lds \ - $(obj)/vdso32/vclock_gettime.o \ - $(obj)/vdso32/note.o \ - $(obj)/vdso32/%.o - $(call if_changed,vdso) - -# -# The DSO images are built using a special linker script. -# -quiet_cmd_vdso = VDSO $@ - cmd_vdso = $(CC) -nostdlib -o $@ \ - $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ - sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' - -VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ - $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) -GCOV_PROFILE := n - -# -# Install the unstripped copies of vdso*.so. If our toolchain supports -# build-id, install .build-id links as well. -# -quiet_cmd_vdso_install = INSTALL $(@:install_%=%) -define cmd_vdso_install - cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \ - if readelf -n $< |grep -q 'Build ID'; then \ - buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \ - first=`echo $$buildid | cut -b-2`; \ - last=`echo $$buildid | cut -b3-`; \ - mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \ - ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \ - fi -endef - -vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) - -$(MODLIB)/vdso: FORCE - @mkdir -p $(MODLIB)/vdso - -$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE - $(call cmd,vdso_install) - -PHONY += vdso_install $(vdso_img_insttargets) -vdso_install: $(vdso_img_insttargets) FORCE - -clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh deleted file mode 100755 index 7ee90a9b549d..000000000000 --- a/arch/x86/vdso/checkundef.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -nm="$1" -file="$2" -$nm "$file" | grep '^ *U' > /dev/null 2>&1 -if [ $? -eq 1 ]; then - exit 0 -else - echo "$file: undefined symbols found" >&2 - exit 1 -fi diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c deleted file mode 100644 index 9793322751e0..000000000000 --- a/arch/x86/vdso/vclock_gettime.c +++ /dev/null @@ -1,351 +0,0 @@ -/* - * Copyright 2006 Andi Kleen, SUSE Labs. - * Subject to the GNU Public License, v.2 - * - * Fast user context implementation of clock_gettime, gettimeofday, and time. - * - * 32 Bit compat layer by Stefani Seibold - * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany - * - * The code should have no internal unresolved relocations. - * Check with readelf after changing. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#define gtod (&VVAR(vsyscall_gtod_data)) - -extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); -extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); -extern time_t __vdso_time(time_t *t); - -#ifdef CONFIG_HPET_TIMER -extern u8 hpet_page - __attribute__((visibility("hidden"))); - -static notrace cycle_t vread_hpet(void) -{ - return *(const volatile u32 *)(&hpet_page + HPET_COUNTER); -} -#endif - -#ifndef BUILD_VDSO32 - -#include -#include -#include -#include - -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) -{ - long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); - return ret; -} - -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); - return ret; -} - -#ifdef CONFIG_PARAVIRT_CLOCK - -static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) -{ - const struct pvclock_vsyscall_time_info *pvti_base; - int idx = cpu / (PAGE_SIZE/PVTI_SIZE); - int offset = cpu % (PAGE_SIZE/PVTI_SIZE); - - BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); - - pvti_base = (struct pvclock_vsyscall_time_info *) - __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); - - return &pvti_base[offset]; -} - -static notrace cycle_t vread_pvclock(int *mode) -{ - const struct pvclock_vsyscall_time_info *pvti; - cycle_t ret; - u64 last; - u32 version; - u8 flags; - unsigned cpu, cpu1; - - - /* - * Note: hypervisor must guarantee that: - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. - * 2. that per-CPU pvclock time info is updated if the - * underlying CPU changes. - * 3. that version is increased whenever underlying CPU - * changes. - * - */ - do { - cpu = __getcpu() & VGETCPU_CPU_MASK; - /* TODO: We can put vcpu id into higher bits of pvti.version. - * This will save a couple of cycles by getting rid of - * __getcpu() calls (Gleb). - */ - - pvti = get_pvti(cpu); - - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); - - /* - * Test we're still on the cpu as well as the version. - * We could have been migrated just after the first - * vgetcpu but before fetching the version, so we - * wouldn't notice a version change. - */ - cpu1 = __getcpu() & VGETCPU_CPU_MASK; - } while (unlikely(cpu != cpu1 || - (pvti->pvti.version & 1) || - pvti->pvti.version != version)); - - if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) - *mode = VCLOCK_NONE; - - /* refer to tsc.c read_tsc() comment for rationale */ - last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - return last; -} -#endif - -#else - -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) -{ - long ret; - - asm( - "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) - : "memory", "edx"); - return ret; -} - -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm( - "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) - : "memory", "edx"); - return ret; -} - -#ifdef CONFIG_PARAVIRT_CLOCK - -static notrace cycle_t vread_pvclock(int *mode) -{ - *mode = VCLOCK_NONE; - return 0; -} -#endif - -#endif - -notrace static cycle_t vread_tsc(void) -{ - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. - */ - rdtsc_barrier(); - ret = (cycle_t)__native_read_tsc(); - - last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. - */ - asm volatile (""); - return last; -} - -notrace static inline u64 vgetsns(int *mode) -{ - u64 v; - cycles_t cycles; - - if (gtod->vclock_mode == VCLOCK_TSC) - cycles = vread_tsc(); -#ifdef CONFIG_HPET_TIMER - else if (gtod->vclock_mode == VCLOCK_HPET) - cycles = vread_hpet(); -#endif -#ifdef CONFIG_PARAVIRT_CLOCK - else if (gtod->vclock_mode == VCLOCK_PVCLOCK) - cycles = vread_pvclock(mode); -#endif - else - return 0; - v = (cycles - gtod->cycle_last) & gtod->mask; - return v * gtod->mult; -} - -/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ -notrace static int __always_inline do_realtime(struct timespec *ts) -{ - unsigned long seq; - u64 ns; - int mode; - - do { - seq = gtod_read_begin(gtod); - mode = gtod->vclock_mode; - ts->tv_sec = gtod->wall_time_sec; - ns = gtod->wall_time_snsec; - ns += vgetsns(&mode); - ns >>= gtod->shift; - } while (unlikely(gtod_read_retry(gtod, seq))); - - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; - - return mode; -} - -notrace static int __always_inline do_monotonic(struct timespec *ts) -{ - unsigned long seq; - u64 ns; - int mode; - - do { - seq = gtod_read_begin(gtod); - mode = gtod->vclock_mode; - ts->tv_sec = gtod->monotonic_time_sec; - ns = gtod->monotonic_time_snsec; - ns += vgetsns(&mode); - ns >>= gtod->shift; - } while (unlikely(gtod_read_retry(gtod, seq))); - - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; - - return mode; -} - -notrace static void do_realtime_coarse(struct timespec *ts) -{ - unsigned long seq; - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = gtod->wall_time_coarse_sec; - ts->tv_nsec = gtod->wall_time_coarse_nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); -} - -notrace static void do_monotonic_coarse(struct timespec *ts) -{ - unsigned long seq; - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = gtod->monotonic_time_coarse_sec; - ts->tv_nsec = gtod->monotonic_time_coarse_nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); -} - -notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) -{ - switch (clock) { - case CLOCK_REALTIME: - if (do_realtime(ts) == VCLOCK_NONE) - goto fallback; - break; - case CLOCK_MONOTONIC: - if (do_monotonic(ts) == VCLOCK_NONE) - goto fallback; - break; - case CLOCK_REALTIME_COARSE: - do_realtime_coarse(ts); - break; - case CLOCK_MONOTONIC_COARSE: - do_monotonic_coarse(ts); - break; - default: - goto fallback; - } - - return 0; -fallback: - return vdso_fallback_gettime(clock, ts); -} -int clock_gettime(clockid_t, struct timespec *) - __attribute__((weak, alias("__vdso_clock_gettime"))); - -notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) -{ - if (likely(tv != NULL)) { - if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) - return vdso_fallback_gtod(tv, tz); - tv->tv_usec /= 1000; - } - if (unlikely(tz != NULL)) { - tz->tz_minuteswest = gtod->tz_minuteswest; - tz->tz_dsttime = gtod->tz_dsttime; - } - - return 0; -} -int gettimeofday(struct timeval *, struct timezone *) - __attribute__((weak, alias("__vdso_gettimeofday"))); - -/* - * This will break when the xtime seconds get inaccurate, but that is - * unlikely - */ -notrace time_t __vdso_time(time_t *t) -{ - /* This is atomic on x86 so we don't need any locks. */ - time_t result = ACCESS_ONCE(gtod->wall_time_sec); - - if (t) - *t = result; - return result; -} -int time(time_t *t) - __attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S deleted file mode 100644 index de2c921025f5..000000000000 --- a/arch/x86/vdso/vdso-layout.lds.S +++ /dev/null @@ -1,118 +0,0 @@ -#include - -/* - * Linker script for vDSO. This is an ELF shared object prelinked to - * its virtual address, and with only one read-only segment. - * This script controls its layout. - */ - -#if defined(BUILD_VDSO64) -# define SHDR_SIZE 64 -#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32) -# define SHDR_SIZE 40 -#else -# error unknown VDSO target -#endif - -#define NUM_FAKE_SHDRS 13 - -SECTIONS -{ - /* - * User/kernel shared data is before the vDSO. This may be a little - * uglier than putting it after the vDSO, but it avoids issues with - * non-allocatable things that dangle past the end of the PT_LOAD - * segment. - */ - - vvar_start = . - 2 * PAGE_SIZE; - vvar_page = vvar_start; - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; -#define __VVAR_KERNEL_LDS -#include -#undef __VVAR_KERNEL_LDS -#undef EMIT_VVAR - - hpet_page = vvar_start + PAGE_SIZE; - - . = SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - .dynamic : { *(.dynamic) } :text :dynamic - - .rodata : { - *(.rodata*) - *(.data*) - *(.sdata*) - *(.got.plt) *(.got) - *(.gnu.linkonce.d.*) - *(.bss*) - *(.dynbss*) - *(.gnu.linkonce.b.*) - - /* - * Ideally this would live in a C file, but that won't - * work cleanly for x32 until we start building the x32 - * C code using an x32 toolchain. - */ - VDSO_FAKE_SECTION_TABLE_START = .; - . = . + NUM_FAKE_SHDRS * SHDR_SIZE; - VDSO_FAKE_SECTION_TABLE_END = .; - } :text - - .fake_shstrtab : { *(.fake_shstrtab) } :text - - - .note : { *(.note.*) } :text :note - - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - - - /* - * Text is well-separated from actual data: there's plenty of - * stuff that isn't used at runtime in between. - */ - - .text : { *(.text*) } :text =0x90909090, - - /* - * At the end so that eu-elflint stays happy when vdso2c strips - * these. A better implementation would avoid allocating space - * for these. - */ - .altinstructions : { *(.altinstructions) } :text - .altinstr_replacement : { *(.altinstr_replacement) } :text - - /DISCARD/ : { - *(.discard) - *(.discard.*) - *(__bug_table) - } -} - -/* - * Very old versions of ld do not recognize this name token; use the constant. - */ -#define PT_GNU_EH_FRAME 0x6474e550 - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr PT_GNU_EH_FRAME; -} diff --git a/arch/x86/vdso/vdso-note.S b/arch/x86/vdso/vdso-note.S deleted file mode 100644 index 79a071e4357e..000000000000 --- a/arch/x86/vdso/vdso-note.S +++ /dev/null @@ -1,12 +0,0 @@ -/* - * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. - * Here we can supply some information useful to userland. - */ - -#include -#include -#include - -ELFNOTE_START(Linux, 0, "a") - .long LINUX_VERSION_CODE -ELFNOTE_END diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S deleted file mode 100644 index 6807932643c2..000000000000 --- a/arch/x86/vdso/vdso.lds.S +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Linker script for 64-bit vDSO. - * We #include the file to define the layout details. - * - * This file defines the version script giving the user-exported symbols in - * the DSO. - */ - -#define BUILD_VDSO64 - -#include "vdso-layout.lds.S" - -/* - * This controls what userland symbols we export from the vDSO. - */ -VERSION { - LINUX_2.6 { - global: - clock_gettime; - __vdso_clock_gettime; - gettimeofday; - __vdso_gettimeofday; - getcpu; - __vdso_getcpu; - time; - __vdso_time; - local: *; - }; -} diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c deleted file mode 100644 index 8627db24a7f6..000000000000 --- a/arch/x86/vdso/vdso2c.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * vdso2c - A vdso image preparation tool - * Copyright (c) 2014 Andy Lutomirski and others - * Licensed under the GPL v2 - * - * vdso2c requires stripped and unstripped input. It would be trivial - * to fully strip the input in here, but, for reasons described below, - * we need to write a section table. Doing this is more or less - * equivalent to dropping all non-allocatable sections, but it's - * easier to let objcopy handle that instead of doing it ourselves. - * If we ever need to do something fancier than what objcopy provides, - * it would be straightforward to add here. - * - * We're keep a section table for a few reasons: - * - * The Go runtime had a couple of bugs: it would read the section - * table to try to figure out how many dynamic symbols there were (it - * shouldn't have looked at the section table at all) and, if there - * were no SHT_SYNDYM section table entry, it would use an - * uninitialized value for the number of symbols. An empty DYNSYM - * table would work, but I see no reason not to write a valid one (and - * keep full performance for old Go programs). This hack is only - * needed on x86_64. - * - * The bug was introduced on 2012-08-31 by: - * https://code.google.com/p/go/source/detail?r=56ea40aac72b - * and was fixed on 2014-06-13 by: - * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 - * - * Binutils has issues debugging the vDSO: it reads the section table to - * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which - * would break build-id if we removed the section table. Binutils - * also requires that shstrndx != 0. See: - * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 - * - * elfutils might not look for PT_NOTE if there is a section table at - * all. I don't know whether this matters for any practical purpose. - * - * For simplicity, rather than hacking up a partial section table, we - * just write a mostly complete one. We omit non-dynamic symbols, - * though, since they're rather large. - * - * Once binutils gets fixed, we might be able to drop this for all but - * the 64-bit vdso, since build-id only works in kernel RPMs, and - * systems that update to new enough kernel RPMs will likely update - * binutils in sync. build-id has never worked for home-built kernel - * RPMs without manual symlinking, and I suspect that no one ever does - * that. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include -#include - -const char *outfilename; - -/* Symbols that we need in vdso2c. */ -enum { - sym_vvar_start, - sym_vvar_page, - sym_hpet_page, - sym_VDSO_FAKE_SECTION_TABLE_START, - sym_VDSO_FAKE_SECTION_TABLE_END, -}; - -const int special_pages[] = { - sym_vvar_page, - sym_hpet_page, -}; - -struct vdso_sym { - const char *name; - bool export; -}; - -struct vdso_sym required_syms[] = { - [sym_vvar_start] = {"vvar_start", true}, - [sym_vvar_page] = {"vvar_page", true}, - [sym_hpet_page] = {"hpet_page", true}, - [sym_VDSO_FAKE_SECTION_TABLE_START] = { - "VDSO_FAKE_SECTION_TABLE_START", false - }, - [sym_VDSO_FAKE_SECTION_TABLE_END] = { - "VDSO_FAKE_SECTION_TABLE_END", false - }, - {"VDSO32_NOTE_MASK", true}, - {"VDSO32_SYSENTER_RETURN", true}, - {"__kernel_vsyscall", true}, - {"__kernel_sigreturn", true}, - {"__kernel_rt_sigreturn", true}, -}; - -__attribute__((format(printf, 1, 2))) __attribute__((noreturn)) -static void fail(const char *format, ...) -{ - va_list ap; - va_start(ap, format); - fprintf(stderr, "Error: "); - vfprintf(stderr, format, ap); - if (outfilename) - unlink(outfilename); - exit(1); - va_end(ap); -} - -/* - * Evil macros for little-endian reads and writes - */ -#define GLE(x, bits, ifnot) \ - __builtin_choose_expr( \ - (sizeof(*(x)) == bits/8), \ - (__typeof__(*(x)))get_unaligned_le##bits(x), ifnot) - -extern void bad_get_le(void); -#define LAST_GLE(x) \ - __builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le()) - -#define GET_LE(x) \ - GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x)))) - -#define PLE(x, val, bits, ifnot) \ - __builtin_choose_expr( \ - (sizeof(*(x)) == bits/8), \ - put_unaligned_le##bits((val), (x)), ifnot) - -extern void bad_put_le(void); -#define LAST_PLE(x, val) \ - __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le()) - -#define PUT_LE(x, val) \ - PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val)))) - - -#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) - -#define BITSFUNC3(name, bits, suffix) name##bits##suffix -#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix) -#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, ) - -#define INT_BITS BITSFUNC2(int, ELF_BITS, _t) - -#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x -#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x) -#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x) - -#define ELF_BITS 64 -#include "vdso2c.h" -#undef ELF_BITS - -#define ELF_BITS 32 -#include "vdso2c.h" -#undef ELF_BITS - -static void go(void *raw_addr, size_t raw_len, - void *stripped_addr, size_t stripped_len, - FILE *outfile, const char *name) -{ - Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr; - - if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { - go64(raw_addr, raw_len, stripped_addr, stripped_len, - outfile, name); - } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { - go32(raw_addr, raw_len, stripped_addr, stripped_len, - outfile, name); - } else { - fail("unknown ELF class\n"); - } -} - -static void map_input(const char *name, void **addr, size_t *len, int prot) -{ - off_t tmp_len; - - int fd = open(name, O_RDONLY); - if (fd == -1) - err(1, "%s", name); - - tmp_len = lseek(fd, 0, SEEK_END); - if (tmp_len == (off_t)-1) - err(1, "lseek"); - *len = (size_t)tmp_len; - - *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0); - if (*addr == MAP_FAILED) - err(1, "mmap"); - - close(fd); -} - -int main(int argc, char **argv) -{ - size_t raw_len, stripped_len; - void *raw_addr, *stripped_addr; - FILE *outfile; - char *name, *tmp; - int namelen; - - if (argc != 4) { - printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n"); - return 1; - } - - /* - * Figure out the struct name. If we're writing to a .so file, - * generate raw output insted. - */ - name = strdup(argv[3]); - namelen = strlen(name); - if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) { - name = NULL; - } else { - tmp = strrchr(name, '/'); - if (tmp) - name = tmp + 1; - tmp = strchr(name, '.'); - if (tmp) - *tmp = '\0'; - for (tmp = name; *tmp; tmp++) - if (*tmp == '-') - *tmp = '_'; - } - - map_input(argv[1], &raw_addr, &raw_len, PROT_READ); - map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ); - - outfilename = argv[3]; - outfile = fopen(outfilename, "w"); - if (!outfile) - err(1, "%s", argv[2]); - - go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); - - munmap(raw_addr, raw_len); - munmap(stripped_addr, stripped_len); - fclose(outfile); - - return 0; -} diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h deleted file mode 100644 index 0224987556ce..000000000000 --- a/arch/x86/vdso/vdso2c.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * This file is included twice from vdso2c.c. It generates code for 32-bit - * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs - * are built for 32-bit userspace. - */ - -static void BITSFUNC(go)(void *raw_addr, size_t raw_len, - void *stripped_addr, size_t stripped_len, - FILE *outfile, const char *name) -{ - int found_load = 0; - unsigned long load_size = -1; /* Work around bogus warning */ - unsigned long mapping_size; - ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; - int i; - unsigned long j; - ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, - *alt_sec = NULL; - ELF(Dyn) *dyn = 0, *dyn_end = 0; - const char *secstrings; - INT_BITS syms[NSYMS] = {}; - - ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff)); - - /* Walk the segment table. */ - for (i = 0; i < GET_LE(&hdr->e_phnum); i++) { - if (GET_LE(&pt[i].p_type) == PT_LOAD) { - if (found_load) - fail("multiple PT_LOAD segs\n"); - - if (GET_LE(&pt[i].p_offset) != 0 || - GET_LE(&pt[i].p_vaddr) != 0) - fail("PT_LOAD in wrong place\n"); - - if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz)) - fail("cannot handle memsz != filesz\n"); - - load_size = GET_LE(&pt[i].p_memsz); - found_load = 1; - } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) { - dyn = raw_addr + GET_LE(&pt[i].p_offset); - dyn_end = raw_addr + GET_LE(&pt[i].p_offset) + - GET_LE(&pt[i].p_memsz); - } - } - if (!found_load) - fail("no PT_LOAD seg\n"); - - if (stripped_len < load_size) - fail("stripped input is too short\n"); - - /* Walk the dynamic table */ - for (i = 0; dyn + i < dyn_end && - GET_LE(&dyn[i].d_tag) != DT_NULL; i++) { - typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag); - if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA || - tag == DT_RELENT || tag == DT_TEXTREL) - fail("vdso image contains dynamic relocations\n"); - } - - /* Walk the section table */ - secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) + - GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx); - secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset); - for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { - ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) + - GET_LE(&hdr->e_shentsize) * i; - if (GET_LE(&sh->sh_type) == SHT_SYMTAB) - symtab_hdr = sh; - - if (!strcmp(secstrings + GET_LE(&sh->sh_name), - ".altinstructions")) - alt_sec = sh; - } - - if (!symtab_hdr) - fail("no symbol table\n"); - - strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) + - GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link); - - /* Walk the symbol table */ - for (i = 0; - i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); - i++) { - int k; - ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + - GET_LE(&symtab_hdr->sh_entsize) * i; - const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) + - GET_LE(&sym->st_name); - - for (k = 0; k < NSYMS; k++) { - if (!strcmp(name, required_syms[k].name)) { - if (syms[k]) { - fail("duplicate symbol %s\n", - required_syms[k].name); - } - - /* - * Careful: we use negative addresses, but - * st_value is unsigned, so we rely - * on syms[k] being a signed type of the - * correct width. - */ - syms[k] = GET_LE(&sym->st_value); - } - } - } - - /* Validate mapping addresses. */ - for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { - INT_BITS symval = syms[special_pages[i]]; - - if (!symval) - continue; /* The mapping isn't used; ignore it. */ - - if (symval % 4096) - fail("%s must be a multiple of 4096\n", - required_syms[i].name); - if (symval + 4096 < syms[sym_vvar_start]) - fail("%s underruns vvar_start\n", - required_syms[i].name); - if (symval + 4096 > 0) - fail("%s is on the wrong side of the vdso text\n", - required_syms[i].name); - } - if (syms[sym_vvar_start] % 4096) - fail("vvar_begin must be a multiple of 4096\n"); - - if (!name) { - fwrite(stripped_addr, stripped_len, 1, outfile); - return; - } - - mapping_size = (stripped_len + 4095) / 4096 * 4096; - - fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); - fprintf(outfile, "#include \n"); - fprintf(outfile, "#include \n"); - fprintf(outfile, "#include \n"); - fprintf(outfile, "\n"); - fprintf(outfile, - "static unsigned char raw_data[%lu] __page_aligned_data = {", - mapping_size); - for (j = 0; j < stripped_len; j++) { - if (j % 10 == 0) - fprintf(outfile, "\n\t"); - fprintf(outfile, "0x%02X, ", - (int)((unsigned char *)stripped_addr)[j]); - } - fprintf(outfile, "\n};\n\n"); - - fprintf(outfile, "static struct page *pages[%lu];\n\n", - mapping_size / 4096); - - fprintf(outfile, "const struct vdso_image %s = {\n", name); - fprintf(outfile, "\t.data = raw_data,\n"); - fprintf(outfile, "\t.size = %lu,\n", mapping_size); - fprintf(outfile, "\t.text_mapping = {\n"); - fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); - fprintf(outfile, "\t\t.pages = pages,\n"); - fprintf(outfile, "\t},\n"); - if (alt_sec) { - fprintf(outfile, "\t.alt = %lu,\n", - (unsigned long)GET_LE(&alt_sec->sh_offset)); - fprintf(outfile, "\t.alt_len = %lu,\n", - (unsigned long)GET_LE(&alt_sec->sh_size)); - } - for (i = 0; i < NSYMS; i++) { - if (required_syms[i].export && syms[i]) - fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", - required_syms[i].name, (int64_t)syms[i]); - } - fprintf(outfile, "};\n"); -} diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c deleted file mode 100644 index e904c270573b..000000000000 --- a/arch/x86/vdso/vdso32-setup.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * (C) Copyright 2002 Linus Torvalds - * Portions based on the vdso-randomization code from exec-shield: - * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar - * - * This file contains the needed initializations to support sysenter. - */ - -#include -#include -#include -#include - -#include -#include -#include - -#ifdef CONFIG_COMPAT_VDSO -#define VDSO_DEFAULT 0 -#else -#define VDSO_DEFAULT 1 -#endif - -/* - * Should the kernel map a VDSO page into processes and pass its - * address down to glibc upon exec()? - */ -unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT; - -static int __init vdso32_setup(char *s) -{ - vdso32_enabled = simple_strtoul(s, NULL, 0); - - if (vdso32_enabled > 1) - pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); - - return 1; -} - -/* - * For consistency, the argument vdso32=[012] affects the 32-bit vDSO - * behavior on both 64-bit and 32-bit kernels. - * On 32-bit kernels, vdso=[012] means the same thing. - */ -__setup("vdso32=", vdso32_setup); - -#ifdef CONFIG_X86_32 -__setup_param("vdso=", vdso_setup, vdso32_setup, 0); -#endif - -#ifdef CONFIG_X86_64 - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) -#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) - -#else /* CONFIG_X86_32 */ - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) -#define vdso32_syscall() (0) - -#endif /* CONFIG_X86_64 */ - -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) -const struct vdso_image *selected_vdso32; -#endif - -int __init sysenter_setup(void) -{ -#ifdef CONFIG_COMPAT - if (vdso32_syscall()) - selected_vdso32 = &vdso_image_32_syscall; - else -#endif - if (vdso32_sysenter()) - selected_vdso32 = &vdso_image_32_sysenter; - else - selected_vdso32 = &vdso_image_32_int80; - - init_vdso_image(selected_vdso32); - - return 0; -} - -#ifdef CONFIG_X86_64 - -subsys_initcall(sysenter_setup); - -#ifdef CONFIG_SYSCTL -/* Register vsyscall32 into the ABI table */ -#include - -static struct ctl_table abi_table2[] = { - { - .procname = "vsyscall32", - .data = &vdso32_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - {} -}; - -static struct ctl_table abi_root_table2[] = { - { - .procname = "abi", - .mode = 0555, - .child = abi_table2 - }, - {} -}; - -static __init int ia32_binfmt_init(void) -{ - register_sysctl_table(abi_root_table2); - return 0; -} -__initcall(ia32_binfmt_init); -#endif /* CONFIG_SYSCTL */ - -#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/vdso/vdso32/.gitignore b/arch/x86/vdso/vdso32/.gitignore deleted file mode 100644 index e45fba9d0ced..000000000000 --- a/arch/x86/vdso/vdso32/.gitignore +++ /dev/null @@ -1 +0,0 @@ -vdso32.lds diff --git a/arch/x86/vdso/vdso32/int80.S b/arch/x86/vdso/vdso32/int80.S deleted file mode 100644 index b15b7c01aedb..000000000000 --- a/arch/x86/vdso/vdso32/int80.S +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Code for the vDSO. This version uses the old int $0x80 method. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - int $0x80 - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 - .previous diff --git a/arch/x86/vdso/vdso32/note.S b/arch/x86/vdso/vdso32/note.S deleted file mode 100644 index c83f25734696..000000000000 --- a/arch/x86/vdso/vdso32/note.S +++ /dev/null @@ -1,44 +0,0 @@ -/* - * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. - * Here we can supply some information useful to userland. - */ - -#include -#include - -/* Ideally this would use UTS_NAME, but using a quoted string here - doesn't work. Remember to change this when changing the - kernel's name. */ -ELFNOTE_START(Linux, 0, "a") - .long LINUX_VERSION_CODE -ELFNOTE_END - -#ifdef CONFIG_XEN -/* - * Add a special note telling glibc's dynamic linker a fake hardware - * flavor that it will use to choose the search path for libraries in the - * same way it uses real hardware capabilities like "mmx". - * We supply "nosegneg" as the fake capability, to indicate that we - * do not like negative offsets in instructions using segment overrides, - * since we implement those inefficiently. This makes it possible to - * install libraries optimized to avoid those access patterns in someplace - * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file - * corresponding to the bits here is needed to make ldconfig work right. - * It should contain: - * hwcap 1 nosegneg - * to match the mapping of bit to name that we give here. - * - * At runtime, the fake hardware feature will be considered to be present - * if its bit is set in the mask word. So, we start with the mask 0, and - * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. - */ - -#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ - -ELFNOTE_START(GNU, 2, "a") - .long 1 /* ncaps */ -VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */ - .long 0 /* mask */ - .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ -ELFNOTE_END -#endif diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/vdso/vdso32/sigreturn.S deleted file mode 100644 index d7ec4e251c0a..000000000000 --- a/arch/x86/vdso/vdso32/sigreturn.S +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Common code for the sigreturn entry points in vDSO images. - * So far this code is the same for both int80 and sysenter versions. - * This file is #include'd by int80.S et al to define them first thing. - * The kernel assumes that the addresses of these routines are constant - * for all vDSO implementations. - */ - -#include -#include -#include - -#ifndef SYSCALL_ENTER_KERNEL -#define SYSCALL_ENTER_KERNEL int $0x80 -#endif - - .text - .globl __kernel_sigreturn - .type __kernel_sigreturn,@function - nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */ - ALIGN -__kernel_sigreturn: -.LSTART_sigreturn: - popl %eax /* XXX does this mean it needs unwind info? */ - movl $__NR_sigreturn, %eax - SYSCALL_ENTER_KERNEL -.LEND_sigreturn: - nop - .size __kernel_sigreturn,.-.LSTART_sigreturn - - .globl __kernel_rt_sigreturn - .type __kernel_rt_sigreturn,@function - ALIGN -__kernel_rt_sigreturn: -.LSTART_rt_sigreturn: - movl $__NR_rt_sigreturn, %eax - SYSCALL_ENTER_KERNEL -.LEND_rt_sigreturn: - nop - .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI1: - .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 -.LSTARTCIEDLSI1: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zRS" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0 /* DW_CFA_nop */ - .align 4 -.LENDCIEDLSI1: - .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ -.LSTARTFDEDLSI1: - .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ - /* HACK: The dwarf2 unwind routines will subtract 1 from the - return address to get an address in the middle of the - presumed call instruction. Since we didn't get here via - a call, we need to include the nop before the real start - to make up for it. */ - .long .LSTART_sigreturn-1-. /* PC-relative start address */ - .long .LEND_sigreturn-.LSTART_sigreturn+1 - .uleb128 0 /* Augmentation */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - complicated by the fact that the "CFA" is always assumed to - be the value of the stack pointer in the caller. This means - that we must define the CFA of this body of code to be the - saved value of the stack pointer in the sigcontext. Which - also means that there is no fixed relation to the other - saved registers, which means that we must use DW_CFA_expression - to compute their addresses. It also means that when we - adjust the stack with the popl, we have to do it all over again. */ - -#define do_cfa_expr(offset) \ - .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ - .byte 0x06; /* DW_OP_deref */ \ -1: - -#define do_expr(regno, offset) \ - .byte 0x10; /* DW_CFA_expression */ \ - .uleb128 regno; /* regno */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ -1: - - do_cfa_expr(IA32_SIGCONTEXT_sp+4) - do_expr(0, IA32_SIGCONTEXT_ax+4) - do_expr(1, IA32_SIGCONTEXT_cx+4) - do_expr(2, IA32_SIGCONTEXT_dx+4) - do_expr(3, IA32_SIGCONTEXT_bx+4) - do_expr(5, IA32_SIGCONTEXT_bp+4) - do_expr(6, IA32_SIGCONTEXT_si+4) - do_expr(7, IA32_SIGCONTEXT_di+4) - do_expr(8, IA32_SIGCONTEXT_ip+4) - - .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ - - do_cfa_expr(IA32_SIGCONTEXT_sp) - do_expr(0, IA32_SIGCONTEXT_ax) - do_expr(1, IA32_SIGCONTEXT_cx) - do_expr(2, IA32_SIGCONTEXT_dx) - do_expr(3, IA32_SIGCONTEXT_bx) - do_expr(5, IA32_SIGCONTEXT_bp) - do_expr(6, IA32_SIGCONTEXT_si) - do_expr(7, IA32_SIGCONTEXT_di) - do_expr(8, IA32_SIGCONTEXT_ip) - - .align 4 -.LENDFDEDLSI1: - - .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ -.LSTARTFDEDLSI2: - .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ - /* HACK: See above wrt unwind library assumptions. */ - .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ - .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 - .uleb128 0 /* Augmentation */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - slightly less complicated than the above, since we don't - modify the stack pointer in the process. */ - - do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp) - do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax) - do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx) - do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx) - do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx) - do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp) - do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si) - do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di) - do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip) - - .align 4 -.LENDFDEDLSI2: - .previous diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/vdso/vdso32/syscall.S deleted file mode 100644 index 6b286bb5251c..000000000000 --- a/arch/x86/vdso/vdso32/syscall.S +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Code for the vDSO. This version uses the syscall instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#define SYSCALL_ENTER_KERNEL syscall -#include "sigreturn.S" - -#include - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ebp -.Lpush_ebp: - movl %ecx, %ebp - syscall - movl %ebp, %ecx - popl %ebp -.Lpop_ebp: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE -.LSTARTCIE: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIE: - - .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ -.LSTARTFDE1: - .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 8 - .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ - .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 4 - .align 4 -.LENDFDE1: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 - .previous diff --git a/arch/x86/vdso/vdso32/sysenter.S b/arch/x86/vdso/vdso32/sysenter.S deleted file mode 100644 index e354bceee0e0..000000000000 --- a/arch/x86/vdso/vdso32/sysenter.S +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Code for the vDSO. This version uses the sysenter instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - -/* - * The caller puts arg2 in %ecx, which gets pushed. The kernel will use - * %ecx itself for arg2. The pushing is because the sysexit instruction - * (found in entry.S) requires that we clobber %ecx with the desired %esp. - * User code might expect that %ecx is unclobbered though, as it would be - * for returning via the iret instruction, so we must push and pop. - * - * The caller puts arg3 in %edx, which the sysexit instruction requires - * for %eip. Thus, exactly as for arg2, we must push and pop. - * - * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter - * instruction clobbers %esp, the user's %esp won't even survive entry - * into the kernel. We store %esp in %ebp. Code in entry.S must fetch - * arg6 from the stack. - * - * You can not use this vsyscall for the clone() syscall because the - * three words on the parent stack do not get copied to the child. - */ - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ecx -.Lpush_ecx: - push %edx -.Lpush_edx: - push %ebp -.Lenter_kernel: - movl %esp,%ebp - sysenter - - /* 7: align return point with nop's to make disassembly easier */ - .space 7,0x90 - - /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - int $0x80 - /* 16: System call normal return point is here! */ -VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ - pop %ebp -.Lpop_ebp: - pop %edx -.Lpop_edx: - pop %ecx -.Lpop_ecx: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x10 /* RA at offset 16 now */ - .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ - /* Finally the epilogue. */ - .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x04 /* RA at offset 4 now */ - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Emit a symbol with the size of this .eh_frame data, - * to verify it matches the other versions. - */ -VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c deleted file mode 100644 index 175cc72c0f68..000000000000 --- a/arch/x86/vdso/vdso32/vclock_gettime.c +++ /dev/null @@ -1,30 +0,0 @@ -#define BUILD_VDSO32 - -#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE -#undef CONFIG_OPTIMIZE_INLINING -#endif - -#undef CONFIG_X86_PPRO_FENCE - -#ifdef CONFIG_X86_64 - -/* - * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel - * configuration - */ -#undef CONFIG_64BIT -#undef CONFIG_X86_64 -#undef CONFIG_ILLEGAL_POINTER_VALUE -#undef CONFIG_SPARSEMEM_VMEMMAP -#undef CONFIG_NR_CPUS - -#define CONFIG_X86_32 1 -#define CONFIG_PAGE_OFFSET 0 -#define CONFIG_ILLEGAL_POINTER_VALUE 0 -#define CONFIG_NR_CPUS 1 - -#define BUILD_VDSO32_64 - -#endif - -#include "../vclock_gettime.c" diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/vdso/vdso32/vdso-fakesections.c deleted file mode 100644 index 541468e25265..000000000000 --- a/arch/x86/vdso/vdso32/vdso-fakesections.c +++ /dev/null @@ -1 +0,0 @@ -#include "../vdso-fakesections.c" diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S deleted file mode 100644 index 31056cf294bf..000000000000 --- a/arch/x86/vdso/vdso32/vdso32.lds.S +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Linker script for 32-bit vDSO. - * We #include the file to define the layout details. - * - * This file defines the version script giving the user-exported symbols in - * the DSO. - */ - -#include - -#define BUILD_VDSO32 - -#include "../vdso-layout.lds.S" - -/* The ELF entry point can be used to set the AT_SYSINFO value. */ -ENTRY(__kernel_vsyscall); - -/* - * This controls what userland symbols we export from the vDSO. - */ -VERSION -{ - LINUX_2.6 { - global: - __vdso_clock_gettime; - __vdso_gettimeofday; - __vdso_time; - }; - - LINUX_2.5 { - global: - __kernel_vsyscall; - __kernel_sigreturn; - __kernel_rt_sigreturn; - local: *; - }; -} diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S deleted file mode 100644 index 697c11ece90c..000000000000 --- a/arch/x86/vdso/vdsox32.lds.S +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Linker script for x32 vDSO. - * We #include the file to define the layout details. - * - * This file defines the version script giving the user-exported symbols in - * the DSO. - */ - -#define BUILD_VDSOX32 - -#include "vdso-layout.lds.S" - -/* - * This controls what userland symbols we export from the vDSO. - */ -VERSION { - LINUX_2.6 { - global: - __vdso_clock_gettime; - __vdso_gettimeofday; - __vdso_getcpu; - __vdso_time; - local: *; - }; -} diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c deleted file mode 100644 index 8ec3d1f4ce9a..000000000000 --- a/arch/x86/vdso/vgetcpu.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2006 Andi Kleen, SUSE Labs. - * Subject to the GNU Public License, v.2 - * - * Fast user context implementation of getcpu() - */ - -#include -#include -#include -#include - -notrace long -__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) -{ - unsigned int p; - - p = __getcpu(); - - if (cpu) - *cpu = p & VGETCPU_CPU_MASK; - if (node) - *node = p >> 12; - return 0; -} - -long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) - __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c deleted file mode 100644 index 1c9f750c3859..000000000000 --- a/arch/x86/vdso/vma.c +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Copyright 2007 Andi Kleen, SUSE Labs. - * Subject to the GPL, v.2 - * - * This contains most of the x86 vDSO kernel-side code. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_X86_64) -unsigned int __read_mostly vdso64_enabled = 1; -#endif - -void __init init_vdso_image(const struct vdso_image *image) -{ - int i; - int npages = (image->size) / PAGE_SIZE; - - BUG_ON(image->size % PAGE_SIZE != 0); - for (i = 0; i < npages; i++) - image->text_mapping.pages[i] = - virt_to_page(image->data + i*PAGE_SIZE); - - apply_alternatives((struct alt_instr *)(image->data + image->alt), - (struct alt_instr *)(image->data + image->alt + - image->alt_len)); -} - -struct linux_binprm; - -/* - * Put the vdso above the (randomized) stack with another randomized - * offset. This way there is no hole in the middle of address space. - * To save memory make sure it is still in the same PTE as the stack - * top. This doesn't give that many random bits. - * - * Note that this algorithm is imperfect: the distribution of the vdso - * start address within a PMD is biased toward the end. - * - * Only used for the 64-bit and x32 vdsos. - */ -static unsigned long vdso_addr(unsigned long start, unsigned len) -{ -#ifdef CONFIG_X86_32 - return 0; -#else - unsigned long addr, end; - unsigned offset; - - /* - * Round up the start address. It can start out unaligned as a result - * of stack start randomization. - */ - start = PAGE_ALIGN(start); - - /* Round the lowest possible end address up to a PMD boundary. */ - end = (start + len + PMD_SIZE - 1) & PMD_MASK; - if (end >= TASK_SIZE_MAX) - end = TASK_SIZE_MAX; - end -= len; - - if (end > start) { - offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); - addr = start + (offset << PAGE_SHIFT); - } else { - addr = start; - } - - /* - * Forcibly align the final address in case we have a hardware - * issue that requires alignment for performance reasons. - */ - addr = align_vdso_addr(addr); - - return addr; -#endif -} - -static int map_vdso(const struct vdso_image *image, bool calculate_addr) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long addr, text_start; - int ret = 0; - static struct page *no_pages[] = {NULL}; - static struct vm_special_mapping vvar_mapping = { - .name = "[vvar]", - .pages = no_pages, - }; - - if (calculate_addr) { - addr = vdso_addr(current->mm->start_stack, - image->size - image->sym_vvar_start); - } else { - addr = 0; - } - - down_write(&mm->mmap_sem); - - addr = get_unmapped_area(NULL, addr, - image->size - image->sym_vvar_start, 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } - - text_start = addr - image->sym_vvar_start; - current->mm->context.vdso = (void __user *)text_start; - - /* - * MAYWRITE to allow gdb to COW and set breakpoints - */ - vma = _install_special_mapping(mm, - text_start, - image->size, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &image->text_mapping); - - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto up_fail; - } - - vma = _install_special_mapping(mm, - addr, - -image->sym_vvar_start, - VM_READ|VM_MAYREAD, - &vvar_mapping); - - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto up_fail; - } - - if (image->sym_vvar_page) - ret = remap_pfn_range(vma, - text_start + image->sym_vvar_page, - __pa_symbol(&__vvar_page) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - -#ifdef CONFIG_HPET_TIMER - if (hpet_address && image->sym_hpet_page) { - ret = io_remap_pfn_range(vma, - text_start + image->sym_hpet_page, - hpet_address >> PAGE_SHIFT, - PAGE_SIZE, - pgprot_noncached(PAGE_READONLY)); - - if (ret) - goto up_fail; - } -#endif - -up_fail: - if (ret) - current->mm->context.vdso = NULL; - - up_write(&mm->mmap_sem); - return ret; -} - -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) -static int load_vdso32(void) -{ - int ret; - - if (vdso32_enabled != 1) /* Other values all mean "disabled" */ - return 0; - - ret = map_vdso(selected_vdso32, false); - if (ret) - return ret; - - if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) - current_thread_info()->sysenter_return = - current->mm->context.vdso + - selected_vdso32->sym_VDSO32_SYSENTER_RETURN; - - return 0; -} -#endif - -#ifdef CONFIG_X86_64 -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) -{ - if (!vdso64_enabled) - return 0; - - return map_vdso(&vdso_image_64, true); -} - -#ifdef CONFIG_COMPAT -int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) -{ -#ifdef CONFIG_X86_X32_ABI - if (test_thread_flag(TIF_X32)) { - if (!vdso64_enabled) - return 0; - - return map_vdso(&vdso_image_x32, true); - } -#endif - - return load_vdso32(); -} -#endif -#else -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) -{ - return load_vdso32(); -} -#endif - -#ifdef CONFIG_X86_64 -static __init int vdso_setup(char *s) -{ - vdso64_enabled = simple_strtoul(s, NULL, 0); - return 0; -} -__setup("vdso=", vdso_setup); -#endif - -#ifdef CONFIG_X86_64 -static void vgetcpu_cpu_init(void *arg) -{ - int cpu = smp_processor_id(); - struct desc_struct d = { }; - unsigned long node = 0; -#ifdef CONFIG_NUMA - node = cpu_to_node(cpu); -#endif - if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); - - /* - * Store cpu number in limit so that it can be loaded - * quickly in user space in vgetcpu. (12 bits for the CPU - * and 8 bits for the node) - */ - d.limit0 = cpu | ((node & 0xf) << 12); - d.limit = node >> 4; - d.type = 5; /* RO data, expand down, accessed */ - d.dpl = 3; /* Visible to user code */ - d.s = 1; /* Not a system segment */ - d.p = 1; /* Present */ - d.d = 1; /* 32-bit */ - - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); -} - -static int -vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg) -{ - long cpu = (long)arg; - - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); - - return NOTIFY_DONE; -} - -static int __init init_vdso(void) -{ - init_vdso_image(&vdso_image_64); - -#ifdef CONFIG_X86_X32_ABI - init_vdso_image(&vdso_image_x32); -#endif - - cpu_notifier_register_begin(); - - on_each_cpu(vgetcpu_cpu_init, NULL, 1); - /* notifier priority > KVM */ - __hotcpu_notifier(vgetcpu_cpu_notifier, 30); - - cpu_notifier_register_done(); - - return 0; -} -subsys_initcall(init_vdso); -#endif /* CONFIG_X86_64 */ -- cgit v1.2.1 From e6b93f4e48af8fe8373ec1132b8c7787890e7578 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:10:43 +0200 Subject: x86/asm/entry: Move the 'thunk' functions to arch/x86/entry/ These are all calling x86 entry code functions, so move them close to other entry code. Change lib-y to obj-y: there's no real difference between the two as we don't really drop any of them during the linking stage, and obj-y is the more common approach for core kernel object code. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 3 +-- arch/x86/entry/thunk_32.S | 42 +++++++++++++++++++++++++++++ arch/x86/entry/thunk_64.S | 69 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/lib/Makefile | 1 - arch/x86/lib/thunk_32.S | 42 ----------------------------- arch/x86/lib/thunk_64.S | 69 ----------------------------------------------- arch/x86/um/Makefile | 2 +- 7 files changed, 113 insertions(+), 115 deletions(-) create mode 100644 arch/x86/entry/thunk_32.S create mode 100644 arch/x86/entry/thunk_64.S delete mode 100644 arch/x86/lib/thunk_32.S delete mode 100644 arch/x86/lib/thunk_64.S (limited to 'arch/x86') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index c97532492ef5..9df72c8a0ac2 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,8 +1,7 @@ # # Makefile for the x86 low level entry code # -obj-y := entry_$(BITS).o - +obj-y := entry_$(BITS).o thunk_$(BITS).o obj-y += vdso/ obj-$(CONFIG_IA32_EMULATION) += ia32entry.o diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S new file mode 100644 index 000000000000..e9acf5f4fc92 --- /dev/null +++ b/arch/x86/entry/thunk_32.S @@ -0,0 +1,42 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. + */ + #include + #include + + /* put return address in eax (arg1) */ + .macro THUNK name, func, put_ret_addr_in_eax=0 + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + + .if \put_ret_addr_in_eax + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + .endif + + call \func + popl %edx + popl %ecx + popl %eax + ret + _ASM_NOKPROBE(\name) + .endm + +#ifdef CONFIG_TRACE_IRQFLAGS + THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 + THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 +#endif + +#ifdef CONFIG_PREEMPT + THUNK ___preempt_schedule, preempt_schedule +#ifdef CONFIG_CONTEXT_TRACKING + THUNK ___preempt_schedule_context, preempt_schedule_context +#endif +#endif + diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S new file mode 100644 index 000000000000..10f555e435e1 --- /dev/null +++ b/arch/x86/entry/thunk_64.S @@ -0,0 +1,69 @@ +/* + * Save registers before calling assembly functions. This avoids + * disturbance of register allocation in some inline assembly constructs. + * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. + * Subject to the GNU public license, v.2. No warranty of any kind. + */ +#include +#include +#include + + /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ + .macro THUNK name, func, put_ret_addr_in_rdi=0 + .globl \name +\name: + + /* this one pushes 9 elems, the next one would be %rIP */ + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + + .if \put_ret_addr_in_rdi + /* 9*8(%rsp) is return addr on stack */ + movq 9*8(%rsp), %rdi + .endif + + call \func + jmp restore + _ASM_NOKPROBE(\name) + .endm + +#ifdef CONFIG_TRACE_IRQFLAGS + THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 + THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + THUNK lockdep_sys_exit_thunk,lockdep_sys_exit +#endif + +#ifdef CONFIG_PREEMPT + THUNK ___preempt_schedule, preempt_schedule +#ifdef CONFIG_CONTEXT_TRACKING + THUNK ___preempt_schedule_context, preempt_schedule_context +#endif +#endif + +#if defined(CONFIG_TRACE_IRQFLAGS) \ + || defined(CONFIG_DEBUG_LOCK_ALLOC) \ + || defined(CONFIG_PREEMPT) +restore: + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + ret + _ASM_NOKPROBE(restore) +#endif diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 982989d282ff..f2587888d987 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -17,7 +17,6 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o misc.o cmdline.o -lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S deleted file mode 100644 index e9acf5f4fc92..000000000000 --- a/arch/x86/lib/thunk_32.S +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) - * Copyright 2008 by Steven Rostedt, Red Hat, Inc - * (inspired by Andi Kleen's thunk_64.S) - * Subject to the GNU public license, v.2. No warranty of any kind. - */ - #include - #include - - /* put return address in eax (arg1) */ - .macro THUNK name, func, put_ret_addr_in_eax=0 - .globl \name -\name: - pushl %eax - pushl %ecx - pushl %edx - - .if \put_ret_addr_in_eax - /* Place EIP in the arg1 */ - movl 3*4(%esp), %eax - .endif - - call \func - popl %edx - popl %ecx - popl %eax - ret - _ASM_NOKPROBE(\name) - .endm - -#ifdef CONFIG_TRACE_IRQFLAGS - THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 - THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 -#endif - -#ifdef CONFIG_PREEMPT - THUNK ___preempt_schedule, preempt_schedule -#ifdef CONFIG_CONTEXT_TRACKING - THUNK ___preempt_schedule_context, preempt_schedule_context -#endif -#endif - diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S deleted file mode 100644 index 10f555e435e1..000000000000 --- a/arch/x86/lib/thunk_64.S +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Save registers before calling assembly functions. This avoids - * disturbance of register allocation in some inline assembly constructs. - * Copyright 2001,2002 by Andi Kleen, SuSE Labs. - * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. - * Subject to the GNU public license, v.2. No warranty of any kind. - */ -#include -#include -#include - - /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ - .macro THUNK name, func, put_ret_addr_in_rdi=0 - .globl \name -\name: - - /* this one pushes 9 elems, the next one would be %rIP */ - pushq %rdi - pushq %rsi - pushq %rdx - pushq %rcx - pushq %rax - pushq %r8 - pushq %r9 - pushq %r10 - pushq %r11 - - .if \put_ret_addr_in_rdi - /* 9*8(%rsp) is return addr on stack */ - movq 9*8(%rsp), %rdi - .endif - - call \func - jmp restore - _ASM_NOKPROBE(\name) - .endm - -#ifdef CONFIG_TRACE_IRQFLAGS - THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 - THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - THUNK lockdep_sys_exit_thunk,lockdep_sys_exit -#endif - -#ifdef CONFIG_PREEMPT - THUNK ___preempt_schedule, preempt_schedule -#ifdef CONFIG_CONTEXT_TRACKING - THUNK ___preempt_schedule_context, preempt_schedule_context -#endif -#endif - -#if defined(CONFIG_TRACE_IRQFLAGS) \ - || defined(CONFIG_DEBUG_LOCK_ALLOC) \ - || defined(CONFIG_PREEMPT) -restore: - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rax - popq %rcx - popq %rdx - popq %rsi - popq %rdi - ret - _ASM_NOKPROBE(restore) -#endif diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index acb384d24669..a8fecc226946 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -26,7 +26,7 @@ else obj-y += syscalls_64.o vdso/ -subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../lib/thunk_64.o \ +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o \ ../lib/rwsem.o endif -- cgit v1.2.1 From d36f947904dba0935a0e74dc9df2ef57caeb7d03 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:29:26 +0200 Subject: x86/asm/entry: Move arch/x86/include/asm/calling.h to arch/x86/entry/ asm/calling.h is private to the entry code, make this more apparent by moving it to the new arch/x86/entry/ directory. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 243 +++++++++++++++++++++++++++++++++++++++++ arch/x86/entry/entry_64.S | 2 +- arch/x86/entry/ia32entry.S | 2 +- arch/x86/entry/thunk_64.S | 2 +- arch/x86/include/asm/calling.h | 243 ----------------------------------------- 5 files changed, 246 insertions(+), 246 deletions(-) create mode 100644 arch/x86/entry/calling.h delete mode 100644 arch/x86/include/asm/calling.h (limited to 'arch/x86') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h new file mode 100644 index 000000000000..f4e6308c4200 --- /dev/null +++ b/arch/x86/entry/calling.h @@ -0,0 +1,243 @@ +/* + + x86 function call convention, 64-bit: + ------------------------------------- + arguments | callee-saved | extra caller-saved | return + [callee-clobbered] | | [callee-clobbered] | + --------------------------------------------------------------------------- + rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**] + + ( rsp is obviously invariant across normal function calls. (gcc can 'merge' + functions when it sees tail-call optimization possibilities) rflags is + clobbered. Leftover arguments are passed over the stack frame.) + + [*] In the frame-pointers case rbp is fixed to the stack frame. + + [**] for struct return values wider than 64 bits the return convention is a + bit more complex: up to 128 bits width we return small structures + straight in rax, rdx. For structures larger than that (3 words or + larger) the caller puts a pointer to an on-stack return struct + [allocated in the caller's stack frame] into the first argument - i.e. + into rdi. All other arguments shift up by one in this case. + Fortunately this case is rare in the kernel. + +For 32-bit we have the following conventions - kernel is built with +-mregparm=3 and -freg-struct-return: + + x86 function calling convention, 32-bit: + ---------------------------------------- + arguments | callee-saved | extra caller-saved | return + [callee-clobbered] | | [callee-clobbered] | + ------------------------------------------------------------------------- + eax edx ecx | ebx edi esi ebp [*] | | eax, edx [**] + + ( here too esp is obviously invariant across normal function calls. eflags + is clobbered. Leftover arguments are passed over the stack frame. ) + + [*] In the frame-pointers case ebp is fixed to the stack frame. + + [**] We build with -freg-struct-return, which on 32-bit means similar + semantics as on 64-bit: edx can be used for a second return value + (i.e. covering integer and structure sizes up to 64 bits) - after that + it gets more complex and more expensive: 3-word or larger struct returns + get done in the caller's frame and the pointer to the return struct goes + into regparm0, i.e. eax - the other arguments shift up and the + function's register parameters degenerate to regparm=2 in essence. + +*/ + +#ifdef CONFIG_X86_64 + +/* + * 64-bit system call stack frame layout defines and helpers, + * for assembly code: + */ + +/* The layout forms the "struct pt_regs" on the stack: */ +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ +#define R15 0*8 +#define R14 1*8 +#define R13 2*8 +#define R12 3*8 +#define RBP 4*8 +#define RBX 5*8 +/* These regs are callee-clobbered. Always saved on kernel entry. */ +#define R11 6*8 +#define R10 7*8 +#define R9 8*8 +#define R8 9*8 +#define RAX 10*8 +#define RCX 11*8 +#define RDX 12*8 +#define RSI 13*8 +#define RDI 14*8 +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 15*8 +/* Return frame for iretq */ +#define RIP 16*8 +#define CS 17*8 +#define EFLAGS 18*8 +#define RSP 19*8 +#define SS 20*8 + +#define SIZEOF_PTREGS 21*8 + + .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 + addq $-(15*8+\addskip), %rsp + .endm + + .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 + .if \r11 + movq %r11, 6*8+\offset(%rsp) + .endif + .if \r8910 + movq %r10, 7*8+\offset(%rsp) + movq %r9, 8*8+\offset(%rsp) + movq %r8, 9*8+\offset(%rsp) + .endif + .if \rax + movq %rax, 10*8+\offset(%rsp) + .endif + .if \rcx + movq %rcx, 11*8+\offset(%rsp) + .endif + movq %rdx, 12*8+\offset(%rsp) + movq %rsi, 13*8+\offset(%rsp) + movq %rdi, 14*8+\offset(%rsp) + .endm + .macro SAVE_C_REGS offset=0 + SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 + SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_R891011 + SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RCX_R891011 + SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 + SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 + .endm + + .macro SAVE_EXTRA_REGS offset=0 + movq %r15, 0*8+\offset(%rsp) + movq %r14, 1*8+\offset(%rsp) + movq %r13, 2*8+\offset(%rsp) + movq %r12, 3*8+\offset(%rsp) + movq %rbp, 4*8+\offset(%rsp) + movq %rbx, 5*8+\offset(%rsp) + .endm + .macro SAVE_EXTRA_REGS_RBP offset=0 + movq %rbp, 4*8+\offset(%rsp) + .endm + + .macro RESTORE_EXTRA_REGS offset=0 + movq 0*8+\offset(%rsp), %r15 + movq 1*8+\offset(%rsp), %r14 + movq 2*8+\offset(%rsp), %r13 + movq 3*8+\offset(%rsp), %r12 + movq 4*8+\offset(%rsp), %rbp + movq 5*8+\offset(%rsp), %rbx + .endm + + .macro ZERO_EXTRA_REGS + xorl %r15d, %r15d + xorl %r14d, %r14d + xorl %r13d, %r13d + xorl %r12d, %r12d + xorl %ebp, %ebp + xorl %ebx, %ebx + .endm + + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 + .if \rstor_r11 + movq 6*8(%rsp), %r11 + .endif + .if \rstor_r8910 + movq 7*8(%rsp), %r10 + movq 8*8(%rsp), %r9 + movq 9*8(%rsp), %r8 + .endif + .if \rstor_rax + movq 10*8(%rsp), %rax + .endif + .if \rstor_rcx + movq 11*8(%rsp), %rcx + .endif + .if \rstor_rdx + movq 12*8(%rsp), %rdx + .endif + movq 13*8(%rsp), %rsi + movq 14*8(%rsp), %rdi + .endm + .macro RESTORE_C_REGS + RESTORE_C_REGS_HELPER 1,1,1,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RAX + RESTORE_C_REGS_HELPER 0,1,1,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RCX + RESTORE_C_REGS_HELPER 1,0,1,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_R11 + RESTORE_C_REGS_HELPER 1,1,0,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RCX_R11 + RESTORE_C_REGS_HELPER 1,0,0,1,1 + .endm + .macro RESTORE_RSI_RDI + RESTORE_C_REGS_HELPER 0,0,0,0,0 + .endm + .macro RESTORE_RSI_RDI_RDX + RESTORE_C_REGS_HELPER 0,0,0,0,1 + .endm + + .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 + subq $-(15*8+\addskip), %rsp + .endm + + .macro icebp + .byte 0xf1 + .endm + +#else /* CONFIG_X86_64 */ + +/* + * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These + * are different from the entry_32.S versions in not changing the segment + * registers. So only suitable for in kernel use, not when transitioning + * from or to user space. The resulting stack frame is not a standard + * pt_regs frame. The main use case is calling C code from assembler + * when all the registers need to be preserved. + */ + + .macro SAVE_ALL + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + .endm + + .macro RESTORE_ALL + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + .endm + +#endif /* CONFIG_X86_64 */ + diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4ad79e946f5a..f7380ea75777 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -28,7 +28,7 @@ #include #include #include -#include +#include "calling.h" #include #include #include diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index 2be23c734db5..f16767417385 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -4,7 +4,7 @@ * Copyright 2000-2002 Andi Kleen, SuSE Labs. */ -#include +#include "calling.h" #include #include #include diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index 10f555e435e1..3e95681b4e2d 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -6,7 +6,7 @@ * Subject to the GNU public license, v.2. No warranty of any kind. */ #include -#include +#include "calling.h" #include /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h deleted file mode 100644 index f4e6308c4200..000000000000 --- a/arch/x86/include/asm/calling.h +++ /dev/null @@ -1,243 +0,0 @@ -/* - - x86 function call convention, 64-bit: - ------------------------------------- - arguments | callee-saved | extra caller-saved | return - [callee-clobbered] | | [callee-clobbered] | - --------------------------------------------------------------------------- - rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**] - - ( rsp is obviously invariant across normal function calls. (gcc can 'merge' - functions when it sees tail-call optimization possibilities) rflags is - clobbered. Leftover arguments are passed over the stack frame.) - - [*] In the frame-pointers case rbp is fixed to the stack frame. - - [**] for struct return values wider than 64 bits the return convention is a - bit more complex: up to 128 bits width we return small structures - straight in rax, rdx. For structures larger than that (3 words or - larger) the caller puts a pointer to an on-stack return struct - [allocated in the caller's stack frame] into the first argument - i.e. - into rdi. All other arguments shift up by one in this case. - Fortunately this case is rare in the kernel. - -For 32-bit we have the following conventions - kernel is built with --mregparm=3 and -freg-struct-return: - - x86 function calling convention, 32-bit: - ---------------------------------------- - arguments | callee-saved | extra caller-saved | return - [callee-clobbered] | | [callee-clobbered] | - ------------------------------------------------------------------------- - eax edx ecx | ebx edi esi ebp [*] | | eax, edx [**] - - ( here too esp is obviously invariant across normal function calls. eflags - is clobbered. Leftover arguments are passed over the stack frame. ) - - [*] In the frame-pointers case ebp is fixed to the stack frame. - - [**] We build with -freg-struct-return, which on 32-bit means similar - semantics as on 64-bit: edx can be used for a second return value - (i.e. covering integer and structure sizes up to 64 bits) - after that - it gets more complex and more expensive: 3-word or larger struct returns - get done in the caller's frame and the pointer to the return struct goes - into regparm0, i.e. eax - the other arguments shift up and the - function's register parameters degenerate to regparm=2 in essence. - -*/ - -#ifdef CONFIG_X86_64 - -/* - * 64-bit system call stack frame layout defines and helpers, - * for assembly code: - */ - -/* The layout forms the "struct pt_regs" on the stack: */ -/* - * C ABI says these regs are callee-preserved. They aren't saved on kernel entry - * unless syscall needs a complete, fully filled "struct pt_regs". - */ -#define R15 0*8 -#define R14 1*8 -#define R13 2*8 -#define R12 3*8 -#define RBP 4*8 -#define RBX 5*8 -/* These regs are callee-clobbered. Always saved on kernel entry. */ -#define R11 6*8 -#define R10 7*8 -#define R9 8*8 -#define R8 9*8 -#define RAX 10*8 -#define RCX 11*8 -#define RDX 12*8 -#define RSI 13*8 -#define RDI 14*8 -/* - * On syscall entry, this is syscall#. On CPU exception, this is error code. - * On hw interrupt, it's IRQ number: - */ -#define ORIG_RAX 15*8 -/* Return frame for iretq */ -#define RIP 16*8 -#define CS 17*8 -#define EFLAGS 18*8 -#define RSP 19*8 -#define SS 20*8 - -#define SIZEOF_PTREGS 21*8 - - .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 - addq $-(15*8+\addskip), %rsp - .endm - - .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 - .if \r11 - movq %r11, 6*8+\offset(%rsp) - .endif - .if \r8910 - movq %r10, 7*8+\offset(%rsp) - movq %r9, 8*8+\offset(%rsp) - movq %r8, 9*8+\offset(%rsp) - .endif - .if \rax - movq %rax, 10*8+\offset(%rsp) - .endif - .if \rcx - movq %rcx, 11*8+\offset(%rsp) - .endif - movq %rdx, 12*8+\offset(%rsp) - movq %rsi, 13*8+\offset(%rsp) - movq %rdi, 14*8+\offset(%rsp) - .endm - .macro SAVE_C_REGS offset=0 - SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 - .endm - .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 - SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 - .endm - .macro SAVE_C_REGS_EXCEPT_R891011 - SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 - .endm - .macro SAVE_C_REGS_EXCEPT_RCX_R891011 - SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 - .endm - .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 - SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 - .endm - - .macro SAVE_EXTRA_REGS offset=0 - movq %r15, 0*8+\offset(%rsp) - movq %r14, 1*8+\offset(%rsp) - movq %r13, 2*8+\offset(%rsp) - movq %r12, 3*8+\offset(%rsp) - movq %rbp, 4*8+\offset(%rsp) - movq %rbx, 5*8+\offset(%rsp) - .endm - .macro SAVE_EXTRA_REGS_RBP offset=0 - movq %rbp, 4*8+\offset(%rsp) - .endm - - .macro RESTORE_EXTRA_REGS offset=0 - movq 0*8+\offset(%rsp), %r15 - movq 1*8+\offset(%rsp), %r14 - movq 2*8+\offset(%rsp), %r13 - movq 3*8+\offset(%rsp), %r12 - movq 4*8+\offset(%rsp), %rbp - movq 5*8+\offset(%rsp), %rbx - .endm - - .macro ZERO_EXTRA_REGS - xorl %r15d, %r15d - xorl %r14d, %r14d - xorl %r13d, %r13d - xorl %r12d, %r12d - xorl %ebp, %ebp - xorl %ebx, %ebx - .endm - - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 - .if \rstor_r11 - movq 6*8(%rsp), %r11 - .endif - .if \rstor_r8910 - movq 7*8(%rsp), %r10 - movq 8*8(%rsp), %r9 - movq 9*8(%rsp), %r8 - .endif - .if \rstor_rax - movq 10*8(%rsp), %rax - .endif - .if \rstor_rcx - movq 11*8(%rsp), %rcx - .endif - .if \rstor_rdx - movq 12*8(%rsp), %rdx - .endif - movq 13*8(%rsp), %rsi - movq 14*8(%rsp), %rdi - .endm - .macro RESTORE_C_REGS - RESTORE_C_REGS_HELPER 1,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RAX - RESTORE_C_REGS_HELPER 0,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX - RESTORE_C_REGS_HELPER 1,0,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_R11 - RESTORE_C_REGS_HELPER 1,1,0,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX_R11 - RESTORE_C_REGS_HELPER 1,0,0,1,1 - .endm - .macro RESTORE_RSI_RDI - RESTORE_C_REGS_HELPER 0,0,0,0,0 - .endm - .macro RESTORE_RSI_RDI_RDX - RESTORE_C_REGS_HELPER 0,0,0,0,1 - .endm - - .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 - subq $-(15*8+\addskip), %rsp - .endm - - .macro icebp - .byte 0xf1 - .endm - -#else /* CONFIG_X86_64 */ - -/* - * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These - * are different from the entry_32.S versions in not changing the segment - * registers. So only suitable for in kernel use, not when transitioning - * from or to user space. The resulting stack frame is not a standard - * pt_regs frame. The main use case is calling C code from assembler - * when all the registers need to be preserved. - */ - - .macro SAVE_ALL - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - .endm - - .macro RESTORE_ALL - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - .endm - -#endif /* CONFIG_X86_64 */ - -- cgit v1.2.1 From 1f57d5d85ba7f1f467173ff33f51d01a91f9aaf1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:36:41 +0200 Subject: x86/asm/entry: Move the arch/x86/syscalls/ definitions to arch/x86/entry/syscalls/ The build time generated syscall definitions are entry code related, move them into the arch/x86/entry/ directory. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 2 +- arch/x86/entry/syscalls/Makefile | 69 ++++++ arch/x86/entry/syscalls/syscall_32.tbl | 367 ++++++++++++++++++++++++++++++++ arch/x86/entry/syscalls/syscall_64.tbl | 370 +++++++++++++++++++++++++++++++++ arch/x86/entry/syscalls/syscallhdr.sh | 27 +++ arch/x86/entry/syscalls/syscalltbl.sh | 15 ++ arch/x86/syscalls/Makefile | 69 ------ arch/x86/syscalls/syscall_32.tbl | 367 -------------------------------- arch/x86/syscalls/syscall_64.tbl | 370 --------------------------------- arch/x86/syscalls/syscallhdr.sh | 27 --- arch/x86/syscalls/syscalltbl.sh | 15 -- 11 files changed, 849 insertions(+), 849 deletions(-) create mode 100644 arch/x86/entry/syscalls/Makefile create mode 100644 arch/x86/entry/syscalls/syscall_32.tbl create mode 100644 arch/x86/entry/syscalls/syscall_64.tbl create mode 100644 arch/x86/entry/syscalls/syscallhdr.sh create mode 100644 arch/x86/entry/syscalls/syscalltbl.sh delete mode 100644 arch/x86/syscalls/Makefile delete mode 100644 arch/x86/syscalls/syscall_32.tbl delete mode 100644 arch/x86/syscalls/syscall_64.tbl delete mode 100644 arch/x86/syscalls/syscallhdr.sh delete mode 100644 arch/x86/syscalls/syscalltbl.sh (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 90b1c3bfec46..118e6debc483 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -181,7 +181,7 @@ archscripts: scripts_basic # Syscall table generation archheaders: - $(Q)$(MAKE) $(build)=arch/x86/syscalls all + $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all archprepare: ifeq ($(CONFIG_KEXEC_FILE),y) diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile new file mode 100644 index 000000000000..57aa59fd140c --- /dev/null +++ b/arch/x86/entry/syscalls/Makefile @@ -0,0 +1,69 @@ +out := $(obj)/../../include/generated/asm +uapi := $(obj)/../../include/generated/uapi/asm + +# Create output directory if not already present +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ + $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') + +syscall32 := $(srctree)/$(src)/syscall_32.tbl +syscall64 := $(srctree)/$(src)/syscall_64.tbl + +syshdr := $(srctree)/$(src)/syscallhdr.sh +systbl := $(srctree)/$(src)/syscalltbl.sh + +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ + '$(syshdr_abi_$(basetarget))' \ + '$(syshdr_pfx_$(basetarget))' \ + '$(syshdr_offset_$(basetarget))' +quiet_cmd_systbl = SYSTBL $@ + cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ + +quiet_cmd_hypercalls = HYPERCALLS $@ + cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^) + +syshdr_abi_unistd_32 := i386 +$(uapi)/unistd_32.h: $(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_32_ia32 := i386 +syshdr_pfx_unistd_32_ia32 := ia32_ +$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_x32 := common,x32 +syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT +$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_64 := common,64 +$(uapi)/unistd_64.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_64_x32 := x32 +syshdr_pfx_unistd_64_x32 := x32_ +$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +$(out)/syscalls_32.h: $(syscall32) $(systbl) + $(call if_changed,systbl) +$(out)/syscalls_64.h: $(syscall64) $(systbl) + $(call if_changed,systbl) + +$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh + $(call if_changed,hypercalls) + +$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h + +uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h +syshdr-y += syscalls_32.h +syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h +syshdr-$(CONFIG_X86_64) += syscalls_64.h +syshdr-$(CONFIG_XEN) += xen-hypercalls.h + +targets += $(uapisyshdr-y) $(syshdr-y) + +PHONY += all +all: $(addprefix $(uapi)/,$(uapisyshdr-y)) +all: $(addprefix $(out)/,$(syshdr-y)) + @: diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl new file mode 100644 index 000000000000..ef8187f9d28d --- /dev/null +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -0,0 +1,367 @@ +# +# 32-bit system call numbers and entry vectors +# +# The format is: +# +# +# The abi is always "i386" for this file. +# +0 i386 restart_syscall sys_restart_syscall +1 i386 exit sys_exit +2 i386 fork sys_fork stub32_fork +3 i386 read sys_read +4 i386 write sys_write +5 i386 open sys_open compat_sys_open +6 i386 close sys_close +7 i386 waitpid sys_waitpid sys32_waitpid +8 i386 creat sys_creat +9 i386 link sys_link +10 i386 unlink sys_unlink +11 i386 execve sys_execve stub32_execve +12 i386 chdir sys_chdir +13 i386 time sys_time compat_sys_time +14 i386 mknod sys_mknod +15 i386 chmod sys_chmod +16 i386 lchown sys_lchown16 +17 i386 break +18 i386 oldstat sys_stat +19 i386 lseek sys_lseek compat_sys_lseek +20 i386 getpid sys_getpid +21 i386 mount sys_mount compat_sys_mount +22 i386 umount sys_oldumount +23 i386 setuid sys_setuid16 +24 i386 getuid sys_getuid16 +25 i386 stime sys_stime compat_sys_stime +26 i386 ptrace sys_ptrace compat_sys_ptrace +27 i386 alarm sys_alarm +28 i386 oldfstat sys_fstat +29 i386 pause sys_pause +30 i386 utime sys_utime compat_sys_utime +31 i386 stty +32 i386 gtty +33 i386 access sys_access +34 i386 nice sys_nice +35 i386 ftime +36 i386 sync sys_sync +37 i386 kill sys_kill +38 i386 rename sys_rename +39 i386 mkdir sys_mkdir +40 i386 rmdir sys_rmdir +41 i386 dup sys_dup +42 i386 pipe sys_pipe +43 i386 times sys_times compat_sys_times +44 i386 prof +45 i386 brk sys_brk +46 i386 setgid sys_setgid16 +47 i386 getgid sys_getgid16 +48 i386 signal sys_signal +49 i386 geteuid sys_geteuid16 +50 i386 getegid sys_getegid16 +51 i386 acct sys_acct +52 i386 umount2 sys_umount +53 i386 lock +54 i386 ioctl sys_ioctl compat_sys_ioctl +55 i386 fcntl sys_fcntl compat_sys_fcntl64 +56 i386 mpx +57 i386 setpgid sys_setpgid +58 i386 ulimit +59 i386 oldolduname sys_olduname +60 i386 umask sys_umask +61 i386 chroot sys_chroot +62 i386 ustat sys_ustat compat_sys_ustat +63 i386 dup2 sys_dup2 +64 i386 getppid sys_getppid +65 i386 getpgrp sys_getpgrp +66 i386 setsid sys_setsid +67 i386 sigaction sys_sigaction compat_sys_sigaction +68 i386 sgetmask sys_sgetmask +69 i386 ssetmask sys_ssetmask +70 i386 setreuid sys_setreuid16 +71 i386 setregid sys_setregid16 +72 i386 sigsuspend sys_sigsuspend sys_sigsuspend +73 i386 sigpending sys_sigpending compat_sys_sigpending +74 i386 sethostname sys_sethostname +75 i386 setrlimit sys_setrlimit compat_sys_setrlimit +76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit +77 i386 getrusage sys_getrusage compat_sys_getrusage +78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 i386 settimeofday sys_settimeofday compat_sys_settimeofday +80 i386 getgroups sys_getgroups16 +81 i386 setgroups sys_setgroups16 +82 i386 select sys_old_select compat_sys_old_select +83 i386 symlink sys_symlink +84 i386 oldlstat sys_lstat +85 i386 readlink sys_readlink +86 i386 uselib sys_uselib +87 i386 swapon sys_swapon +88 i386 reboot sys_reboot +89 i386 readdir sys_old_readdir compat_sys_old_readdir +90 i386 mmap sys_old_mmap sys32_mmap +91 i386 munmap sys_munmap +92 i386 truncate sys_truncate compat_sys_truncate +93 i386 ftruncate sys_ftruncate compat_sys_ftruncate +94 i386 fchmod sys_fchmod +95 i386 fchown sys_fchown16 +96 i386 getpriority sys_getpriority +97 i386 setpriority sys_setpriority +98 i386 profil +99 i386 statfs sys_statfs compat_sys_statfs +100 i386 fstatfs sys_fstatfs compat_sys_fstatfs +101 i386 ioperm sys_ioperm +102 i386 socketcall sys_socketcall compat_sys_socketcall +103 i386 syslog sys_syslog +104 i386 setitimer sys_setitimer compat_sys_setitimer +105 i386 getitimer sys_getitimer compat_sys_getitimer +106 i386 stat sys_newstat compat_sys_newstat +107 i386 lstat sys_newlstat compat_sys_newlstat +108 i386 fstat sys_newfstat compat_sys_newfstat +109 i386 olduname sys_uname +110 i386 iopl sys_iopl +111 i386 vhangup sys_vhangup +112 i386 idle +113 i386 vm86old sys_vm86old sys_ni_syscall +114 i386 wait4 sys_wait4 compat_sys_wait4 +115 i386 swapoff sys_swapoff +116 i386 sysinfo sys_sysinfo compat_sys_sysinfo +117 i386 ipc sys_ipc compat_sys_ipc +118 i386 fsync sys_fsync +119 i386 sigreturn sys_sigreturn stub32_sigreturn +120 i386 clone sys_clone stub32_clone +121 i386 setdomainname sys_setdomainname +122 i386 uname sys_newuname +123 i386 modify_ldt sys_modify_ldt +124 i386 adjtimex sys_adjtimex compat_sys_adjtimex +125 i386 mprotect sys_mprotect +126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask +127 i386 create_module +128 i386 init_module sys_init_module +129 i386 delete_module sys_delete_module +130 i386 get_kernel_syms +131 i386 quotactl sys_quotactl sys32_quotactl +132 i386 getpgid sys_getpgid +133 i386 fchdir sys_fchdir +134 i386 bdflush sys_bdflush +135 i386 sysfs sys_sysfs +136 i386 personality sys_personality +137 i386 afs_syscall +138 i386 setfsuid sys_setfsuid16 +139 i386 setfsgid sys_setfsgid16 +140 i386 _llseek sys_llseek +141 i386 getdents sys_getdents compat_sys_getdents +142 i386 _newselect sys_select compat_sys_select +143 i386 flock sys_flock +144 i386 msync sys_msync +145 i386 readv sys_readv compat_sys_readv +146 i386 writev sys_writev compat_sys_writev +147 i386 getsid sys_getsid +148 i386 fdatasync sys_fdatasync +149 i386 _sysctl sys_sysctl compat_sys_sysctl +150 i386 mlock sys_mlock +151 i386 munlock sys_munlock +152 i386 mlockall sys_mlockall +153 i386 munlockall sys_munlockall +154 i386 sched_setparam sys_sched_setparam +155 i386 sched_getparam sys_sched_getparam +156 i386 sched_setscheduler sys_sched_setscheduler +157 i386 sched_getscheduler sys_sched_getscheduler +158 i386 sched_yield sys_sched_yield +159 i386 sched_get_priority_max sys_sched_get_priority_max +160 i386 sched_get_priority_min sys_sched_get_priority_min +161 i386 sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval +162 i386 nanosleep sys_nanosleep compat_sys_nanosleep +163 i386 mremap sys_mremap +164 i386 setresuid sys_setresuid16 +165 i386 getresuid sys_getresuid16 +166 i386 vm86 sys_vm86 sys_ni_syscall +167 i386 query_module +168 i386 poll sys_poll +169 i386 nfsservctl +170 i386 setresgid sys_setresgid16 +171 i386 getresgid sys_getresgid16 +172 i386 prctl sys_prctl +173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn +174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction +175 i386 rt_sigprocmask sys_rt_sigprocmask +176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending +177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo +179 i386 rt_sigsuspend sys_rt_sigsuspend +180 i386 pread64 sys_pread64 sys32_pread +181 i386 pwrite64 sys_pwrite64 sys32_pwrite +182 i386 chown sys_chown16 +183 i386 getcwd sys_getcwd +184 i386 capget sys_capget +185 i386 capset sys_capset +186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack +187 i386 sendfile sys_sendfile compat_sys_sendfile +188 i386 getpmsg +189 i386 putpmsg +190 i386 vfork sys_vfork stub32_vfork +191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit +192 i386 mmap2 sys_mmap_pgoff +193 i386 truncate64 sys_truncate64 sys32_truncate64 +194 i386 ftruncate64 sys_ftruncate64 sys32_ftruncate64 +195 i386 stat64 sys_stat64 sys32_stat64 +196 i386 lstat64 sys_lstat64 sys32_lstat64 +197 i386 fstat64 sys_fstat64 sys32_fstat64 +198 i386 lchown32 sys_lchown +199 i386 getuid32 sys_getuid +200 i386 getgid32 sys_getgid +201 i386 geteuid32 sys_geteuid +202 i386 getegid32 sys_getegid +203 i386 setreuid32 sys_setreuid +204 i386 setregid32 sys_setregid +205 i386 getgroups32 sys_getgroups +206 i386 setgroups32 sys_setgroups +207 i386 fchown32 sys_fchown +208 i386 setresuid32 sys_setresuid +209 i386 getresuid32 sys_getresuid +210 i386 setresgid32 sys_setresgid +211 i386 getresgid32 sys_getresgid +212 i386 chown32 sys_chown +213 i386 setuid32 sys_setuid +214 i386 setgid32 sys_setgid +215 i386 setfsuid32 sys_setfsuid +216 i386 setfsgid32 sys_setfsgid +217 i386 pivot_root sys_pivot_root +218 i386 mincore sys_mincore +219 i386 madvise sys_madvise +220 i386 getdents64 sys_getdents64 compat_sys_getdents64 +221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 +# 222 is unused +# 223 is unused +224 i386 gettid sys_gettid +225 i386 readahead sys_readahead sys32_readahead +226 i386 setxattr sys_setxattr +227 i386 lsetxattr sys_lsetxattr +228 i386 fsetxattr sys_fsetxattr +229 i386 getxattr sys_getxattr +230 i386 lgetxattr sys_lgetxattr +231 i386 fgetxattr sys_fgetxattr +232 i386 listxattr sys_listxattr +233 i386 llistxattr sys_llistxattr +234 i386 flistxattr sys_flistxattr +235 i386 removexattr sys_removexattr +236 i386 lremovexattr sys_lremovexattr +237 i386 fremovexattr sys_fremovexattr +238 i386 tkill sys_tkill +239 i386 sendfile64 sys_sendfile64 +240 i386 futex sys_futex compat_sys_futex +241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +243 i386 set_thread_area sys_set_thread_area +244 i386 get_thread_area sys_get_thread_area +245 i386 io_setup sys_io_setup compat_sys_io_setup +246 i386 io_destroy sys_io_destroy +247 i386 io_getevents sys_io_getevents compat_sys_io_getevents +248 i386 io_submit sys_io_submit compat_sys_io_submit +249 i386 io_cancel sys_io_cancel +250 i386 fadvise64 sys_fadvise64 sys32_fadvise64 +# 251 is available for reuse (was briefly sys_set_zone_reclaim) +252 i386 exit_group sys_exit_group +253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +254 i386 epoll_create sys_epoll_create +255 i386 epoll_ctl sys_epoll_ctl +256 i386 epoll_wait sys_epoll_wait +257 i386 remap_file_pages sys_remap_file_pages +258 i386 set_tid_address sys_set_tid_address +259 i386 timer_create sys_timer_create compat_sys_timer_create +260 i386 timer_settime sys_timer_settime compat_sys_timer_settime +261 i386 timer_gettime sys_timer_gettime compat_sys_timer_gettime +262 i386 timer_getoverrun sys_timer_getoverrun +263 i386 timer_delete sys_timer_delete +264 i386 clock_settime sys_clock_settime compat_sys_clock_settime +265 i386 clock_gettime sys_clock_gettime compat_sys_clock_gettime +266 i386 clock_getres sys_clock_getres compat_sys_clock_getres +267 i386 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +268 i386 statfs64 sys_statfs64 compat_sys_statfs64 +269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +270 i386 tgkill sys_tgkill +271 i386 utimes sys_utimes compat_sys_utimes +272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64 +273 i386 vserver +274 i386 mbind sys_mbind +275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +276 i386 set_mempolicy sys_set_mempolicy +277 i386 mq_open sys_mq_open compat_sys_mq_open +278 i386 mq_unlink sys_mq_unlink +279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend +280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +281 i386 mq_notify sys_mq_notify compat_sys_mq_notify +282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr +283 i386 kexec_load sys_kexec_load compat_sys_kexec_load +284 i386 waitid sys_waitid compat_sys_waitid +# 285 sys_setaltroot +286 i386 add_key sys_add_key +287 i386 request_key sys_request_key +288 i386 keyctl sys_keyctl +289 i386 ioprio_set sys_ioprio_set +290 i386 ioprio_get sys_ioprio_get +291 i386 inotify_init sys_inotify_init +292 i386 inotify_add_watch sys_inotify_add_watch +293 i386 inotify_rm_watch sys_inotify_rm_watch +294 i386 migrate_pages sys_migrate_pages +295 i386 openat sys_openat compat_sys_openat +296 i386 mkdirat sys_mkdirat +297 i386 mknodat sys_mknodat +298 i386 fchownat sys_fchownat +299 i386 futimesat sys_futimesat compat_sys_futimesat +300 i386 fstatat64 sys_fstatat64 sys32_fstatat +301 i386 unlinkat sys_unlinkat +302 i386 renameat sys_renameat +303 i386 linkat sys_linkat +304 i386 symlinkat sys_symlinkat +305 i386 readlinkat sys_readlinkat +306 i386 fchmodat sys_fchmodat +307 i386 faccessat sys_faccessat +308 i386 pselect6 sys_pselect6 compat_sys_pselect6 +309 i386 ppoll sys_ppoll compat_sys_ppoll +310 i386 unshare sys_unshare +311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list +312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list +313 i386 splice sys_splice +314 i386 sync_file_range sys_sync_file_range sys32_sync_file_range +315 i386 tee sys_tee +316 i386 vmsplice sys_vmsplice compat_sys_vmsplice +317 i386 move_pages sys_move_pages compat_sys_move_pages +318 i386 getcpu sys_getcpu +319 i386 epoll_pwait sys_epoll_pwait +320 i386 utimensat sys_utimensat compat_sys_utimensat +321 i386 signalfd sys_signalfd compat_sys_signalfd +322 i386 timerfd_create sys_timerfd_create +323 i386 eventfd sys_eventfd +324 i386 fallocate sys_fallocate sys32_fallocate +325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime +326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 +328 i386 eventfd2 sys_eventfd2 +329 i386 epoll_create1 sys_epoll_create1 +330 i386 dup3 sys_dup3 +331 i386 pipe2 sys_pipe2 +332 i386 inotify_init1 sys_inotify_init1 +333 i386 preadv sys_preadv compat_sys_preadv +334 i386 pwritev sys_pwritev compat_sys_pwritev +335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +336 i386 perf_event_open sys_perf_event_open +337 i386 recvmmsg sys_recvmmsg compat_sys_recvmmsg +338 i386 fanotify_init sys_fanotify_init +339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark +340 i386 prlimit64 sys_prlimit64 +341 i386 name_to_handle_at sys_name_to_handle_at +342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +343 i386 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +344 i386 syncfs sys_syncfs +345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg +346 i386 setns sys_setns +347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +349 i386 kcmp sys_kcmp +350 i386 finit_module sys_finit_module +351 i386 sched_setattr sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr +353 i386 renameat2 sys_renameat2 +354 i386 seccomp sys_seccomp +355 i386 getrandom sys_getrandom +356 i386 memfd_create sys_memfd_create +357 i386 bpf sys_bpf +358 i386 execveat sys_execveat stub32_execveat diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl new file mode 100644 index 000000000000..9ef32d5f1b19 --- /dev/null +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -0,0 +1,370 @@ +# +# 64-bit system call numbers and entry vectors +# +# The format is: +# +# +# The abi is "common", "64" or "x32" for this file. +# +0 common read sys_read +1 common write sys_write +2 common open sys_open +3 common close sys_close +4 common stat sys_newstat +5 common fstat sys_newfstat +6 common lstat sys_newlstat +7 common poll sys_poll +8 common lseek sys_lseek +9 common mmap sys_mmap +10 common mprotect sys_mprotect +11 common munmap sys_munmap +12 common brk sys_brk +13 64 rt_sigaction sys_rt_sigaction +14 common rt_sigprocmask sys_rt_sigprocmask +15 64 rt_sigreturn stub_rt_sigreturn +16 64 ioctl sys_ioctl +17 common pread64 sys_pread64 +18 common pwrite64 sys_pwrite64 +19 64 readv sys_readv +20 64 writev sys_writev +21 common access sys_access +22 common pipe sys_pipe +23 common select sys_select +24 common sched_yield sys_sched_yield +25 common mremap sys_mremap +26 common msync sys_msync +27 common mincore sys_mincore +28 common madvise sys_madvise +29 common shmget sys_shmget +30 common shmat sys_shmat +31 common shmctl sys_shmctl +32 common dup sys_dup +33 common dup2 sys_dup2 +34 common pause sys_pause +35 common nanosleep sys_nanosleep +36 common getitimer sys_getitimer +37 common alarm sys_alarm +38 common setitimer sys_setitimer +39 common getpid sys_getpid +40 common sendfile sys_sendfile64 +41 common socket sys_socket +42 common connect sys_connect +43 common accept sys_accept +44 common sendto sys_sendto +45 64 recvfrom sys_recvfrom +46 64 sendmsg sys_sendmsg +47 64 recvmsg sys_recvmsg +48 common shutdown sys_shutdown +49 common bind sys_bind +50 common listen sys_listen +51 common getsockname sys_getsockname +52 common getpeername sys_getpeername +53 common socketpair sys_socketpair +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt +56 common clone stub_clone +57 common fork stub_fork +58 common vfork stub_vfork +59 64 execve stub_execve +60 common exit sys_exit +61 common wait4 sys_wait4 +62 common kill sys_kill +63 common uname sys_newuname +64 common semget sys_semget +65 common semop sys_semop +66 common semctl sys_semctl +67 common shmdt sys_shmdt +68 common msgget sys_msgget +69 common msgsnd sys_msgsnd +70 common msgrcv sys_msgrcv +71 common msgctl sys_msgctl +72 common fcntl sys_fcntl +73 common flock sys_flock +74 common fsync sys_fsync +75 common fdatasync sys_fdatasync +76 common truncate sys_truncate +77 common ftruncate sys_ftruncate +78 common getdents sys_getdents +79 common getcwd sys_getcwd +80 common chdir sys_chdir +81 common fchdir sys_fchdir +82 common rename sys_rename +83 common mkdir sys_mkdir +84 common rmdir sys_rmdir +85 common creat sys_creat +86 common link sys_link +87 common unlink sys_unlink +88 common symlink sys_symlink +89 common readlink sys_readlink +90 common chmod sys_chmod +91 common fchmod sys_fchmod +92 common chown sys_chown +93 common fchown sys_fchown +94 common lchown sys_lchown +95 common umask sys_umask +96 common gettimeofday sys_gettimeofday +97 common getrlimit sys_getrlimit +98 common getrusage sys_getrusage +99 common sysinfo sys_sysinfo +100 common times sys_times +101 64 ptrace sys_ptrace +102 common getuid sys_getuid +103 common syslog sys_syslog +104 common getgid sys_getgid +105 common setuid sys_setuid +106 common setgid sys_setgid +107 common geteuid sys_geteuid +108 common getegid sys_getegid +109 common setpgid sys_setpgid +110 common getppid sys_getppid +111 common getpgrp sys_getpgrp +112 common setsid sys_setsid +113 common setreuid sys_setreuid +114 common setregid sys_setregid +115 common getgroups sys_getgroups +116 common setgroups sys_setgroups +117 common setresuid sys_setresuid +118 common getresuid sys_getresuid +119 common setresgid sys_setresgid +120 common getresgid sys_getresgid +121 common getpgid sys_getpgid +122 common setfsuid sys_setfsuid +123 common setfsgid sys_setfsgid +124 common getsid sys_getsid +125 common capget sys_capget +126 common capset sys_capset +127 64 rt_sigpending sys_rt_sigpending +128 64 rt_sigtimedwait sys_rt_sigtimedwait +129 64 rt_sigqueueinfo sys_rt_sigqueueinfo +130 common rt_sigsuspend sys_rt_sigsuspend +131 64 sigaltstack sys_sigaltstack +132 common utime sys_utime +133 common mknod sys_mknod +134 64 uselib +135 common personality sys_personality +136 common ustat sys_ustat +137 common statfs sys_statfs +138 common fstatfs sys_fstatfs +139 common sysfs sys_sysfs +140 common getpriority sys_getpriority +141 common setpriority sys_setpriority +142 common sched_setparam sys_sched_setparam +143 common sched_getparam sys_sched_getparam +144 common sched_setscheduler sys_sched_setscheduler +145 common sched_getscheduler sys_sched_getscheduler +146 common sched_get_priority_max sys_sched_get_priority_max +147 common sched_get_priority_min sys_sched_get_priority_min +148 common sched_rr_get_interval sys_sched_rr_get_interval +149 common mlock sys_mlock +150 common munlock sys_munlock +151 common mlockall sys_mlockall +152 common munlockall sys_munlockall +153 common vhangup sys_vhangup +154 common modify_ldt sys_modify_ldt +155 common pivot_root sys_pivot_root +156 64 _sysctl sys_sysctl +157 common prctl sys_prctl +158 common arch_prctl sys_arch_prctl +159 common adjtimex sys_adjtimex +160 common setrlimit sys_setrlimit +161 common chroot sys_chroot +162 common sync sys_sync +163 common acct sys_acct +164 common settimeofday sys_settimeofday +165 common mount sys_mount +166 common umount2 sys_umount +167 common swapon sys_swapon +168 common swapoff sys_swapoff +169 common reboot sys_reboot +170 common sethostname sys_sethostname +171 common setdomainname sys_setdomainname +172 common iopl sys_iopl +173 common ioperm sys_ioperm +174 64 create_module +175 common init_module sys_init_module +176 common delete_module sys_delete_module +177 64 get_kernel_syms +178 64 query_module +179 common quotactl sys_quotactl +180 64 nfsservctl +181 common getpmsg +182 common putpmsg +183 common afs_syscall +184 common tuxcall +185 common security +186 common gettid sys_gettid +187 common readahead sys_readahead +188 common setxattr sys_setxattr +189 common lsetxattr sys_lsetxattr +190 common fsetxattr sys_fsetxattr +191 common getxattr sys_getxattr +192 common lgetxattr sys_lgetxattr +193 common fgetxattr sys_fgetxattr +194 common listxattr sys_listxattr +195 common llistxattr sys_llistxattr +196 common flistxattr sys_flistxattr +197 common removexattr sys_removexattr +198 common lremovexattr sys_lremovexattr +199 common fremovexattr sys_fremovexattr +200 common tkill sys_tkill +201 common time sys_time +202 common futex sys_futex +203 common sched_setaffinity sys_sched_setaffinity +204 common sched_getaffinity sys_sched_getaffinity +205 64 set_thread_area +206 64 io_setup sys_io_setup +207 common io_destroy sys_io_destroy +208 common io_getevents sys_io_getevents +209 64 io_submit sys_io_submit +210 common io_cancel sys_io_cancel +211 64 get_thread_area +212 common lookup_dcookie sys_lookup_dcookie +213 common epoll_create sys_epoll_create +214 64 epoll_ctl_old +215 64 epoll_wait_old +216 common remap_file_pages sys_remap_file_pages +217 common getdents64 sys_getdents64 +218 common set_tid_address sys_set_tid_address +219 common restart_syscall sys_restart_syscall +220 common semtimedop sys_semtimedop +221 common fadvise64 sys_fadvise64 +222 64 timer_create sys_timer_create +223 common timer_settime sys_timer_settime +224 common timer_gettime sys_timer_gettime +225 common timer_getoverrun sys_timer_getoverrun +226 common timer_delete sys_timer_delete +227 common clock_settime sys_clock_settime +228 common clock_gettime sys_clock_gettime +229 common clock_getres sys_clock_getres +230 common clock_nanosleep sys_clock_nanosleep +231 common exit_group sys_exit_group +232 common epoll_wait sys_epoll_wait +233 common epoll_ctl sys_epoll_ctl +234 common tgkill sys_tgkill +235 common utimes sys_utimes +236 64 vserver +237 common mbind sys_mbind +238 common set_mempolicy sys_set_mempolicy +239 common get_mempolicy sys_get_mempolicy +240 common mq_open sys_mq_open +241 common mq_unlink sys_mq_unlink +242 common mq_timedsend sys_mq_timedsend +243 common mq_timedreceive sys_mq_timedreceive +244 64 mq_notify sys_mq_notify +245 common mq_getsetattr sys_mq_getsetattr +246 64 kexec_load sys_kexec_load +247 64 waitid sys_waitid +248 common add_key sys_add_key +249 common request_key sys_request_key +250 common keyctl sys_keyctl +251 common ioprio_set sys_ioprio_set +252 common ioprio_get sys_ioprio_get +253 common inotify_init sys_inotify_init +254 common inotify_add_watch sys_inotify_add_watch +255 common inotify_rm_watch sys_inotify_rm_watch +256 common migrate_pages sys_migrate_pages +257 common openat sys_openat +258 common mkdirat sys_mkdirat +259 common mknodat sys_mknodat +260 common fchownat sys_fchownat +261 common futimesat sys_futimesat +262 common newfstatat sys_newfstatat +263 common unlinkat sys_unlinkat +264 common renameat sys_renameat +265 common linkat sys_linkat +266 common symlinkat sys_symlinkat +267 common readlinkat sys_readlinkat +268 common fchmodat sys_fchmodat +269 common faccessat sys_faccessat +270 common pselect6 sys_pselect6 +271 common ppoll sys_ppoll +272 common unshare sys_unshare +273 64 set_robust_list sys_set_robust_list +274 64 get_robust_list sys_get_robust_list +275 common splice sys_splice +276 common tee sys_tee +277 common sync_file_range sys_sync_file_range +278 64 vmsplice sys_vmsplice +279 64 move_pages sys_move_pages +280 common utimensat sys_utimensat +281 common epoll_pwait sys_epoll_pwait +282 common signalfd sys_signalfd +283 common timerfd_create sys_timerfd_create +284 common eventfd sys_eventfd +285 common fallocate sys_fallocate +286 common timerfd_settime sys_timerfd_settime +287 common timerfd_gettime sys_timerfd_gettime +288 common accept4 sys_accept4 +289 common signalfd4 sys_signalfd4 +290 common eventfd2 sys_eventfd2 +291 common epoll_create1 sys_epoll_create1 +292 common dup3 sys_dup3 +293 common pipe2 sys_pipe2 +294 common inotify_init1 sys_inotify_init1 +295 64 preadv sys_preadv +296 64 pwritev sys_pwritev +297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +298 common perf_event_open sys_perf_event_open +299 64 recvmmsg sys_recvmmsg +300 common fanotify_init sys_fanotify_init +301 common fanotify_mark sys_fanotify_mark +302 common prlimit64 sys_prlimit64 +303 common name_to_handle_at sys_name_to_handle_at +304 common open_by_handle_at sys_open_by_handle_at +305 common clock_adjtime sys_clock_adjtime +306 common syncfs sys_syncfs +307 64 sendmmsg sys_sendmmsg +308 common setns sys_setns +309 common getcpu sys_getcpu +310 64 process_vm_readv sys_process_vm_readv +311 64 process_vm_writev sys_process_vm_writev +312 common kcmp sys_kcmp +313 common finit_module sys_finit_module +314 common sched_setattr sys_sched_setattr +315 common sched_getattr sys_sched_getattr +316 common renameat2 sys_renameat2 +317 common seccomp sys_seccomp +318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create +320 common kexec_file_load sys_kexec_file_load +321 common bpf sys_bpf +322 64 execveat stub_execveat + +# +# x32-specific system call numbers start at 512 to avoid cache impact +# for native 64-bit operation. +# +512 x32 rt_sigaction compat_sys_rt_sigaction +513 x32 rt_sigreturn stub_x32_rt_sigreturn +514 x32 ioctl compat_sys_ioctl +515 x32 readv compat_sys_readv +516 x32 writev compat_sys_writev +517 x32 recvfrom compat_sys_recvfrom +518 x32 sendmsg compat_sys_sendmsg +519 x32 recvmsg compat_sys_recvmsg +520 x32 execve stub_x32_execve +521 x32 ptrace compat_sys_ptrace +522 x32 rt_sigpending compat_sys_rt_sigpending +523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait +524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo +525 x32 sigaltstack compat_sys_sigaltstack +526 x32 timer_create compat_sys_timer_create +527 x32 mq_notify compat_sys_mq_notify +528 x32 kexec_load compat_sys_kexec_load +529 x32 waitid compat_sys_waitid +530 x32 set_robust_list compat_sys_set_robust_list +531 x32 get_robust_list compat_sys_get_robust_list +532 x32 vmsplice compat_sys_vmsplice +533 x32 move_pages compat_sys_move_pages +534 x32 preadv compat_sys_preadv64 +535 x32 pwritev compat_sys_pwritev64 +536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +537 x32 recvmmsg compat_sys_recvmmsg +538 x32 sendmmsg compat_sys_sendmmsg +539 x32 process_vm_readv compat_sys_process_vm_readv +540 x32 process_vm_writev compat_sys_process_vm_writev +541 x32 setsockopt compat_sys_setsockopt +542 x32 getsockopt compat_sys_getsockopt +543 x32 io_setup compat_sys_io_setup +544 x32 io_submit compat_sys_io_submit +545 x32 execveat stub_x32_execveat diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh new file mode 100644 index 000000000000..31fd5f1f38f7 --- /dev/null +++ b/arch/x86/entry/syscalls/syscallhdr.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +in="$1" +out="$2" +my_abis=`echo "($3)" | tr ',' '|'` +prefix="$4" +offset="$5" + +fileguard=_ASM_X86_`basename "$out" | sed \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ + -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` +grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( + echo "#ifndef ${fileguard}" + echo "#define ${fileguard} 1" + echo "" + + while read nr abi name entry ; do + if [ -z "$offset" ]; then + echo "#define __NR_${prefix}${name} $nr" + else + echo "#define __NR_${prefix}${name} ($offset + $nr)" + fi + done + + echo "" + echo "#endif /* ${fileguard} */" +) > "$out" diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh new file mode 100644 index 000000000000..0e7f8ec071e7 --- /dev/null +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +in="$1" +out="$2" + +grep '^[0-9]' "$in" | sort -n | ( + while read nr abi name entry compat; do + abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` + if [ -n "$compat" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $compat)" + elif [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $entry)" + fi + done +) > "$out" diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile deleted file mode 100644 index a55abb9f6c5e..000000000000 --- a/arch/x86/syscalls/Makefile +++ /dev/null @@ -1,69 +0,0 @@ -out := $(obj)/../include/generated/asm -uapi := $(obj)/../include/generated/uapi/asm - -# Create output directory if not already present -_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ - $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') - -syscall32 := $(srctree)/$(src)/syscall_32.tbl -syscall64 := $(srctree)/$(src)/syscall_64.tbl - -syshdr := $(srctree)/$(src)/syscallhdr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh - -quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abi_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '$(syshdr_offset_$(basetarget))' -quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ - -quiet_cmd_hypercalls = HYPERCALLS $@ - cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^) - -syshdr_abi_unistd_32 := i386 -$(uapi)/unistd_32.h: $(syscall32) $(syshdr) - $(call if_changed,syshdr) - -syshdr_abi_unistd_32_ia32 := i386 -syshdr_pfx_unistd_32_ia32 := ia32_ -$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) - $(call if_changed,syshdr) - -syshdr_abi_unistd_x32 := common,x32 -syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT -$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) - $(call if_changed,syshdr) - -syshdr_abi_unistd_64 := common,64 -$(uapi)/unistd_64.h: $(syscall64) $(syshdr) - $(call if_changed,syshdr) - -syshdr_abi_unistd_64_x32 := x32 -syshdr_pfx_unistd_64_x32 := x32_ -$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) - $(call if_changed,syshdr) - -$(out)/syscalls_32.h: $(syscall32) $(systbl) - $(call if_changed,systbl) -$(out)/syscalls_64.h: $(syscall64) $(systbl) - $(call if_changed,systbl) - -$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh - $(call if_changed,hypercalls) - -$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h - -uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h -syshdr-y += syscalls_32.h -syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h -syshdr-$(CONFIG_X86_64) += syscalls_64.h -syshdr-$(CONFIG_XEN) += xen-hypercalls.h - -targets += $(uapisyshdr-y) $(syshdr-y) - -PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(out)/,$(syshdr-y)) - @: diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl deleted file mode 100644 index ef8187f9d28d..000000000000 --- a/arch/x86/syscalls/syscall_32.tbl +++ /dev/null @@ -1,367 +0,0 @@ -# -# 32-bit system call numbers and entry vectors -# -# The format is: -# -# -# The abi is always "i386" for this file. -# -0 i386 restart_syscall sys_restart_syscall -1 i386 exit sys_exit -2 i386 fork sys_fork stub32_fork -3 i386 read sys_read -4 i386 write sys_write -5 i386 open sys_open compat_sys_open -6 i386 close sys_close -7 i386 waitpid sys_waitpid sys32_waitpid -8 i386 creat sys_creat -9 i386 link sys_link -10 i386 unlink sys_unlink -11 i386 execve sys_execve stub32_execve -12 i386 chdir sys_chdir -13 i386 time sys_time compat_sys_time -14 i386 mknod sys_mknod -15 i386 chmod sys_chmod -16 i386 lchown sys_lchown16 -17 i386 break -18 i386 oldstat sys_stat -19 i386 lseek sys_lseek compat_sys_lseek -20 i386 getpid sys_getpid -21 i386 mount sys_mount compat_sys_mount -22 i386 umount sys_oldumount -23 i386 setuid sys_setuid16 -24 i386 getuid sys_getuid16 -25 i386 stime sys_stime compat_sys_stime -26 i386 ptrace sys_ptrace compat_sys_ptrace -27 i386 alarm sys_alarm -28 i386 oldfstat sys_fstat -29 i386 pause sys_pause -30 i386 utime sys_utime compat_sys_utime -31 i386 stty -32 i386 gtty -33 i386 access sys_access -34 i386 nice sys_nice -35 i386 ftime -36 i386 sync sys_sync -37 i386 kill sys_kill -38 i386 rename sys_rename -39 i386 mkdir sys_mkdir -40 i386 rmdir sys_rmdir -41 i386 dup sys_dup -42 i386 pipe sys_pipe -43 i386 times sys_times compat_sys_times -44 i386 prof -45 i386 brk sys_brk -46 i386 setgid sys_setgid16 -47 i386 getgid sys_getgid16 -48 i386 signal sys_signal -49 i386 geteuid sys_geteuid16 -50 i386 getegid sys_getegid16 -51 i386 acct sys_acct -52 i386 umount2 sys_umount -53 i386 lock -54 i386 ioctl sys_ioctl compat_sys_ioctl -55 i386 fcntl sys_fcntl compat_sys_fcntl64 -56 i386 mpx -57 i386 setpgid sys_setpgid -58 i386 ulimit -59 i386 oldolduname sys_olduname -60 i386 umask sys_umask -61 i386 chroot sys_chroot -62 i386 ustat sys_ustat compat_sys_ustat -63 i386 dup2 sys_dup2 -64 i386 getppid sys_getppid -65 i386 getpgrp sys_getpgrp -66 i386 setsid sys_setsid -67 i386 sigaction sys_sigaction compat_sys_sigaction -68 i386 sgetmask sys_sgetmask -69 i386 ssetmask sys_ssetmask -70 i386 setreuid sys_setreuid16 -71 i386 setregid sys_setregid16 -72 i386 sigsuspend sys_sigsuspend sys_sigsuspend -73 i386 sigpending sys_sigpending compat_sys_sigpending -74 i386 sethostname sys_sethostname -75 i386 setrlimit sys_setrlimit compat_sys_setrlimit -76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit -77 i386 getrusage sys_getrusage compat_sys_getrusage -78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday -79 i386 settimeofday sys_settimeofday compat_sys_settimeofday -80 i386 getgroups sys_getgroups16 -81 i386 setgroups sys_setgroups16 -82 i386 select sys_old_select compat_sys_old_select -83 i386 symlink sys_symlink -84 i386 oldlstat sys_lstat -85 i386 readlink sys_readlink -86 i386 uselib sys_uselib -87 i386 swapon sys_swapon -88 i386 reboot sys_reboot -89 i386 readdir sys_old_readdir compat_sys_old_readdir -90 i386 mmap sys_old_mmap sys32_mmap -91 i386 munmap sys_munmap -92 i386 truncate sys_truncate compat_sys_truncate -93 i386 ftruncate sys_ftruncate compat_sys_ftruncate -94 i386 fchmod sys_fchmod -95 i386 fchown sys_fchown16 -96 i386 getpriority sys_getpriority -97 i386 setpriority sys_setpriority -98 i386 profil -99 i386 statfs sys_statfs compat_sys_statfs -100 i386 fstatfs sys_fstatfs compat_sys_fstatfs -101 i386 ioperm sys_ioperm -102 i386 socketcall sys_socketcall compat_sys_socketcall -103 i386 syslog sys_syslog -104 i386 setitimer sys_setitimer compat_sys_setitimer -105 i386 getitimer sys_getitimer compat_sys_getitimer -106 i386 stat sys_newstat compat_sys_newstat -107 i386 lstat sys_newlstat compat_sys_newlstat -108 i386 fstat sys_newfstat compat_sys_newfstat -109 i386 olduname sys_uname -110 i386 iopl sys_iopl -111 i386 vhangup sys_vhangup -112 i386 idle -113 i386 vm86old sys_vm86old sys_ni_syscall -114 i386 wait4 sys_wait4 compat_sys_wait4 -115 i386 swapoff sys_swapoff -116 i386 sysinfo sys_sysinfo compat_sys_sysinfo -117 i386 ipc sys_ipc compat_sys_ipc -118 i386 fsync sys_fsync -119 i386 sigreturn sys_sigreturn stub32_sigreturn -120 i386 clone sys_clone stub32_clone -121 i386 setdomainname sys_setdomainname -122 i386 uname sys_newuname -123 i386 modify_ldt sys_modify_ldt -124 i386 adjtimex sys_adjtimex compat_sys_adjtimex -125 i386 mprotect sys_mprotect -126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask -127 i386 create_module -128 i386 init_module sys_init_module -129 i386 delete_module sys_delete_module -130 i386 get_kernel_syms -131 i386 quotactl sys_quotactl sys32_quotactl -132 i386 getpgid sys_getpgid -133 i386 fchdir sys_fchdir -134 i386 bdflush sys_bdflush -135 i386 sysfs sys_sysfs -136 i386 personality sys_personality -137 i386 afs_syscall -138 i386 setfsuid sys_setfsuid16 -139 i386 setfsgid sys_setfsgid16 -140 i386 _llseek sys_llseek -141 i386 getdents sys_getdents compat_sys_getdents -142 i386 _newselect sys_select compat_sys_select -143 i386 flock sys_flock -144 i386 msync sys_msync -145 i386 readv sys_readv compat_sys_readv -146 i386 writev sys_writev compat_sys_writev -147 i386 getsid sys_getsid -148 i386 fdatasync sys_fdatasync -149 i386 _sysctl sys_sysctl compat_sys_sysctl -150 i386 mlock sys_mlock -151 i386 munlock sys_munlock -152 i386 mlockall sys_mlockall -153 i386 munlockall sys_munlockall -154 i386 sched_setparam sys_sched_setparam -155 i386 sched_getparam sys_sched_getparam -156 i386 sched_setscheduler sys_sched_setscheduler -157 i386 sched_getscheduler sys_sched_getscheduler -158 i386 sched_yield sys_sched_yield -159 i386 sched_get_priority_max sys_sched_get_priority_max -160 i386 sched_get_priority_min sys_sched_get_priority_min -161 i386 sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval -162 i386 nanosleep sys_nanosleep compat_sys_nanosleep -163 i386 mremap sys_mremap -164 i386 setresuid sys_setresuid16 -165 i386 getresuid sys_getresuid16 -166 i386 vm86 sys_vm86 sys_ni_syscall -167 i386 query_module -168 i386 poll sys_poll -169 i386 nfsservctl -170 i386 setresgid sys_setresgid16 -171 i386 getresgid sys_getresgid16 -172 i386 prctl sys_prctl -173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn -174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction -175 i386 rt_sigprocmask sys_rt_sigprocmask -176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait -178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo -179 i386 rt_sigsuspend sys_rt_sigsuspend -180 i386 pread64 sys_pread64 sys32_pread -181 i386 pwrite64 sys_pwrite64 sys32_pwrite -182 i386 chown sys_chown16 -183 i386 getcwd sys_getcwd -184 i386 capget sys_capget -185 i386 capset sys_capset -186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack -187 i386 sendfile sys_sendfile compat_sys_sendfile -188 i386 getpmsg -189 i386 putpmsg -190 i386 vfork sys_vfork stub32_vfork -191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit -192 i386 mmap2 sys_mmap_pgoff -193 i386 truncate64 sys_truncate64 sys32_truncate64 -194 i386 ftruncate64 sys_ftruncate64 sys32_ftruncate64 -195 i386 stat64 sys_stat64 sys32_stat64 -196 i386 lstat64 sys_lstat64 sys32_lstat64 -197 i386 fstat64 sys_fstat64 sys32_fstat64 -198 i386 lchown32 sys_lchown -199 i386 getuid32 sys_getuid -200 i386 getgid32 sys_getgid -201 i386 geteuid32 sys_geteuid -202 i386 getegid32 sys_getegid -203 i386 setreuid32 sys_setreuid -204 i386 setregid32 sys_setregid -205 i386 getgroups32 sys_getgroups -206 i386 setgroups32 sys_setgroups -207 i386 fchown32 sys_fchown -208 i386 setresuid32 sys_setresuid -209 i386 getresuid32 sys_getresuid -210 i386 setresgid32 sys_setresgid -211 i386 getresgid32 sys_getresgid -212 i386 chown32 sys_chown -213 i386 setuid32 sys_setuid -214 i386 setgid32 sys_setgid -215 i386 setfsuid32 sys_setfsuid -216 i386 setfsgid32 sys_setfsgid -217 i386 pivot_root sys_pivot_root -218 i386 mincore sys_mincore -219 i386 madvise sys_madvise -220 i386 getdents64 sys_getdents64 compat_sys_getdents64 -221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 -# 222 is unused -# 223 is unused -224 i386 gettid sys_gettid -225 i386 readahead sys_readahead sys32_readahead -226 i386 setxattr sys_setxattr -227 i386 lsetxattr sys_lsetxattr -228 i386 fsetxattr sys_fsetxattr -229 i386 getxattr sys_getxattr -230 i386 lgetxattr sys_lgetxattr -231 i386 fgetxattr sys_fgetxattr -232 i386 listxattr sys_listxattr -233 i386 llistxattr sys_llistxattr -234 i386 flistxattr sys_flistxattr -235 i386 removexattr sys_removexattr -236 i386 lremovexattr sys_lremovexattr -237 i386 fremovexattr sys_fremovexattr -238 i386 tkill sys_tkill -239 i386 sendfile64 sys_sendfile64 -240 i386 futex sys_futex compat_sys_futex -241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity -242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity -243 i386 set_thread_area sys_set_thread_area -244 i386 get_thread_area sys_get_thread_area -245 i386 io_setup sys_io_setup compat_sys_io_setup -246 i386 io_destroy sys_io_destroy -247 i386 io_getevents sys_io_getevents compat_sys_io_getevents -248 i386 io_submit sys_io_submit compat_sys_io_submit -249 i386 io_cancel sys_io_cancel -250 i386 fadvise64 sys_fadvise64 sys32_fadvise64 -# 251 is available for reuse (was briefly sys_set_zone_reclaim) -252 i386 exit_group sys_exit_group -253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie -254 i386 epoll_create sys_epoll_create -255 i386 epoll_ctl sys_epoll_ctl -256 i386 epoll_wait sys_epoll_wait -257 i386 remap_file_pages sys_remap_file_pages -258 i386 set_tid_address sys_set_tid_address -259 i386 timer_create sys_timer_create compat_sys_timer_create -260 i386 timer_settime sys_timer_settime compat_sys_timer_settime -261 i386 timer_gettime sys_timer_gettime compat_sys_timer_gettime -262 i386 timer_getoverrun sys_timer_getoverrun -263 i386 timer_delete sys_timer_delete -264 i386 clock_settime sys_clock_settime compat_sys_clock_settime -265 i386 clock_gettime sys_clock_gettime compat_sys_clock_gettime -266 i386 clock_getres sys_clock_getres compat_sys_clock_getres -267 i386 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep -268 i386 statfs64 sys_statfs64 compat_sys_statfs64 -269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 -270 i386 tgkill sys_tgkill -271 i386 utimes sys_utimes compat_sys_utimes -272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64 -273 i386 vserver -274 i386 mbind sys_mbind -275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -276 i386 set_mempolicy sys_set_mempolicy -277 i386 mq_open sys_mq_open compat_sys_mq_open -278 i386 mq_unlink sys_mq_unlink -279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend -280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive -281 i386 mq_notify sys_mq_notify compat_sys_mq_notify -282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr -283 i386 kexec_load sys_kexec_load compat_sys_kexec_load -284 i386 waitid sys_waitid compat_sys_waitid -# 285 sys_setaltroot -286 i386 add_key sys_add_key -287 i386 request_key sys_request_key -288 i386 keyctl sys_keyctl -289 i386 ioprio_set sys_ioprio_set -290 i386 ioprio_get sys_ioprio_get -291 i386 inotify_init sys_inotify_init -292 i386 inotify_add_watch sys_inotify_add_watch -293 i386 inotify_rm_watch sys_inotify_rm_watch -294 i386 migrate_pages sys_migrate_pages -295 i386 openat sys_openat compat_sys_openat -296 i386 mkdirat sys_mkdirat -297 i386 mknodat sys_mknodat -298 i386 fchownat sys_fchownat -299 i386 futimesat sys_futimesat compat_sys_futimesat -300 i386 fstatat64 sys_fstatat64 sys32_fstatat -301 i386 unlinkat sys_unlinkat -302 i386 renameat sys_renameat -303 i386 linkat sys_linkat -304 i386 symlinkat sys_symlinkat -305 i386 readlinkat sys_readlinkat -306 i386 fchmodat sys_fchmodat -307 i386 faccessat sys_faccessat -308 i386 pselect6 sys_pselect6 compat_sys_pselect6 -309 i386 ppoll sys_ppoll compat_sys_ppoll -310 i386 unshare sys_unshare -311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list -312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list -313 i386 splice sys_splice -314 i386 sync_file_range sys_sync_file_range sys32_sync_file_range -315 i386 tee sys_tee -316 i386 vmsplice sys_vmsplice compat_sys_vmsplice -317 i386 move_pages sys_move_pages compat_sys_move_pages -318 i386 getcpu sys_getcpu -319 i386 epoll_pwait sys_epoll_pwait -320 i386 utimensat sys_utimensat compat_sys_utimensat -321 i386 signalfd sys_signalfd compat_sys_signalfd -322 i386 timerfd_create sys_timerfd_create -323 i386 eventfd sys_eventfd -324 i386 fallocate sys_fallocate sys32_fallocate -325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime -326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime -327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 -328 i386 eventfd2 sys_eventfd2 -329 i386 epoll_create1 sys_epoll_create1 -330 i386 dup3 sys_dup3 -331 i386 pipe2 sys_pipe2 -332 i386 inotify_init1 sys_inotify_init1 -333 i386 preadv sys_preadv compat_sys_preadv -334 i386 pwritev sys_pwritev compat_sys_pwritev -335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo -336 i386 perf_event_open sys_perf_event_open -337 i386 recvmmsg sys_recvmmsg compat_sys_recvmmsg -338 i386 fanotify_init sys_fanotify_init -339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark -340 i386 prlimit64 sys_prlimit64 -341 i386 name_to_handle_at sys_name_to_handle_at -342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -343 i386 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime -344 i386 syncfs sys_syncfs -345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg -346 i386 setns sys_setns -347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv -348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev -349 i386 kcmp sys_kcmp -350 i386 finit_module sys_finit_module -351 i386 sched_setattr sys_sched_setattr -352 i386 sched_getattr sys_sched_getattr -353 i386 renameat2 sys_renameat2 -354 i386 seccomp sys_seccomp -355 i386 getrandom sys_getrandom -356 i386 memfd_create sys_memfd_create -357 i386 bpf sys_bpf -358 i386 execveat sys_execveat stub32_execveat diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl deleted file mode 100644 index 9ef32d5f1b19..000000000000 --- a/arch/x86/syscalls/syscall_64.tbl +++ /dev/null @@ -1,370 +0,0 @@ -# -# 64-bit system call numbers and entry vectors -# -# The format is: -# -# -# The abi is "common", "64" or "x32" for this file. -# -0 common read sys_read -1 common write sys_write -2 common open sys_open -3 common close sys_close -4 common stat sys_newstat -5 common fstat sys_newfstat -6 common lstat sys_newlstat -7 common poll sys_poll -8 common lseek sys_lseek -9 common mmap sys_mmap -10 common mprotect sys_mprotect -11 common munmap sys_munmap -12 common brk sys_brk -13 64 rt_sigaction sys_rt_sigaction -14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn stub_rt_sigreturn -16 64 ioctl sys_ioctl -17 common pread64 sys_pread64 -18 common pwrite64 sys_pwrite64 -19 64 readv sys_readv -20 64 writev sys_writev -21 common access sys_access -22 common pipe sys_pipe -23 common select sys_select -24 common sched_yield sys_sched_yield -25 common mremap sys_mremap -26 common msync sys_msync -27 common mincore sys_mincore -28 common madvise sys_madvise -29 common shmget sys_shmget -30 common shmat sys_shmat -31 common shmctl sys_shmctl -32 common dup sys_dup -33 common dup2 sys_dup2 -34 common pause sys_pause -35 common nanosleep sys_nanosleep -36 common getitimer sys_getitimer -37 common alarm sys_alarm -38 common setitimer sys_setitimer -39 common getpid sys_getpid -40 common sendfile sys_sendfile64 -41 common socket sys_socket -42 common connect sys_connect -43 common accept sys_accept -44 common sendto sys_sendto -45 64 recvfrom sys_recvfrom -46 64 sendmsg sys_sendmsg -47 64 recvmsg sys_recvmsg -48 common shutdown sys_shutdown -49 common bind sys_bind -50 common listen sys_listen -51 common getsockname sys_getsockname -52 common getpeername sys_getpeername -53 common socketpair sys_socketpair -54 64 setsockopt sys_setsockopt -55 64 getsockopt sys_getsockopt -56 common clone stub_clone -57 common fork stub_fork -58 common vfork stub_vfork -59 64 execve stub_execve -60 common exit sys_exit -61 common wait4 sys_wait4 -62 common kill sys_kill -63 common uname sys_newuname -64 common semget sys_semget -65 common semop sys_semop -66 common semctl sys_semctl -67 common shmdt sys_shmdt -68 common msgget sys_msgget -69 common msgsnd sys_msgsnd -70 common msgrcv sys_msgrcv -71 common msgctl sys_msgctl -72 common fcntl sys_fcntl -73 common flock sys_flock -74 common fsync sys_fsync -75 common fdatasync sys_fdatasync -76 common truncate sys_truncate -77 common ftruncate sys_ftruncate -78 common getdents sys_getdents -79 common getcwd sys_getcwd -80 common chdir sys_chdir -81 common fchdir sys_fchdir -82 common rename sys_rename -83 common mkdir sys_mkdir -84 common rmdir sys_rmdir -85 common creat sys_creat -86 common link sys_link -87 common unlink sys_unlink -88 common symlink sys_symlink -89 common readlink sys_readlink -90 common chmod sys_chmod -91 common fchmod sys_fchmod -92 common chown sys_chown -93 common fchown sys_fchown -94 common lchown sys_lchown -95 common umask sys_umask -96 common gettimeofday sys_gettimeofday -97 common getrlimit sys_getrlimit -98 common getrusage sys_getrusage -99 common sysinfo sys_sysinfo -100 common times sys_times -101 64 ptrace sys_ptrace -102 common getuid sys_getuid -103 common syslog sys_syslog -104 common getgid sys_getgid -105 common setuid sys_setuid -106 common setgid sys_setgid -107 common geteuid sys_geteuid -108 common getegid sys_getegid -109 common setpgid sys_setpgid -110 common getppid sys_getppid -111 common getpgrp sys_getpgrp -112 common setsid sys_setsid -113 common setreuid sys_setreuid -114 common setregid sys_setregid -115 common getgroups sys_getgroups -116 common setgroups sys_setgroups -117 common setresuid sys_setresuid -118 common getresuid sys_getresuid -119 common setresgid sys_setresgid -120 common getresgid sys_getresgid -121 common getpgid sys_getpgid -122 common setfsuid sys_setfsuid -123 common setfsgid sys_setfsgid -124 common getsid sys_getsid -125 common capget sys_capget -126 common capset sys_capset -127 64 rt_sigpending sys_rt_sigpending -128 64 rt_sigtimedwait sys_rt_sigtimedwait -129 64 rt_sigqueueinfo sys_rt_sigqueueinfo -130 common rt_sigsuspend sys_rt_sigsuspend -131 64 sigaltstack sys_sigaltstack -132 common utime sys_utime -133 common mknod sys_mknod -134 64 uselib -135 common personality sys_personality -136 common ustat sys_ustat -137 common statfs sys_statfs -138 common fstatfs sys_fstatfs -139 common sysfs sys_sysfs -140 common getpriority sys_getpriority -141 common setpriority sys_setpriority -142 common sched_setparam sys_sched_setparam -143 common sched_getparam sys_sched_getparam -144 common sched_setscheduler sys_sched_setscheduler -145 common sched_getscheduler sys_sched_getscheduler -146 common sched_get_priority_max sys_sched_get_priority_max -147 common sched_get_priority_min sys_sched_get_priority_min -148 common sched_rr_get_interval sys_sched_rr_get_interval -149 common mlock sys_mlock -150 common munlock sys_munlock -151 common mlockall sys_mlockall -152 common munlockall sys_munlockall -153 common vhangup sys_vhangup -154 common modify_ldt sys_modify_ldt -155 common pivot_root sys_pivot_root -156 64 _sysctl sys_sysctl -157 common prctl sys_prctl -158 common arch_prctl sys_arch_prctl -159 common adjtimex sys_adjtimex -160 common setrlimit sys_setrlimit -161 common chroot sys_chroot -162 common sync sys_sync -163 common acct sys_acct -164 common settimeofday sys_settimeofday -165 common mount sys_mount -166 common umount2 sys_umount -167 common swapon sys_swapon -168 common swapoff sys_swapoff -169 common reboot sys_reboot -170 common sethostname sys_sethostname -171 common setdomainname sys_setdomainname -172 common iopl sys_iopl -173 common ioperm sys_ioperm -174 64 create_module -175 common init_module sys_init_module -176 common delete_module sys_delete_module -177 64 get_kernel_syms -178 64 query_module -179 common quotactl sys_quotactl -180 64 nfsservctl -181 common getpmsg -182 common putpmsg -183 common afs_syscall -184 common tuxcall -185 common security -186 common gettid sys_gettid -187 common readahead sys_readahead -188 common setxattr sys_setxattr -189 common lsetxattr sys_lsetxattr -190 common fsetxattr sys_fsetxattr -191 common getxattr sys_getxattr -192 common lgetxattr sys_lgetxattr -193 common fgetxattr sys_fgetxattr -194 common listxattr sys_listxattr -195 common llistxattr sys_llistxattr -196 common flistxattr sys_flistxattr -197 common removexattr sys_removexattr -198 common lremovexattr sys_lremovexattr -199 common fremovexattr sys_fremovexattr -200 common tkill sys_tkill -201 common time sys_time -202 common futex sys_futex -203 common sched_setaffinity sys_sched_setaffinity -204 common sched_getaffinity sys_sched_getaffinity -205 64 set_thread_area -206 64 io_setup sys_io_setup -207 common io_destroy sys_io_destroy -208 common io_getevents sys_io_getevents -209 64 io_submit sys_io_submit -210 common io_cancel sys_io_cancel -211 64 get_thread_area -212 common lookup_dcookie sys_lookup_dcookie -213 common epoll_create sys_epoll_create -214 64 epoll_ctl_old -215 64 epoll_wait_old -216 common remap_file_pages sys_remap_file_pages -217 common getdents64 sys_getdents64 -218 common set_tid_address sys_set_tid_address -219 common restart_syscall sys_restart_syscall -220 common semtimedop sys_semtimedop -221 common fadvise64 sys_fadvise64 -222 64 timer_create sys_timer_create -223 common timer_settime sys_timer_settime -224 common timer_gettime sys_timer_gettime -225 common timer_getoverrun sys_timer_getoverrun -226 common timer_delete sys_timer_delete -227 common clock_settime sys_clock_settime -228 common clock_gettime sys_clock_gettime -229 common clock_getres sys_clock_getres -230 common clock_nanosleep sys_clock_nanosleep -231 common exit_group sys_exit_group -232 common epoll_wait sys_epoll_wait -233 common epoll_ctl sys_epoll_ctl -234 common tgkill sys_tgkill -235 common utimes sys_utimes -236 64 vserver -237 common mbind sys_mbind -238 common set_mempolicy sys_set_mempolicy -239 common get_mempolicy sys_get_mempolicy -240 common mq_open sys_mq_open -241 common mq_unlink sys_mq_unlink -242 common mq_timedsend sys_mq_timedsend -243 common mq_timedreceive sys_mq_timedreceive -244 64 mq_notify sys_mq_notify -245 common mq_getsetattr sys_mq_getsetattr -246 64 kexec_load sys_kexec_load -247 64 waitid sys_waitid -248 common add_key sys_add_key -249 common request_key sys_request_key -250 common keyctl sys_keyctl -251 common ioprio_set sys_ioprio_set -252 common ioprio_get sys_ioprio_get -253 common inotify_init sys_inotify_init -254 common inotify_add_watch sys_inotify_add_watch -255 common inotify_rm_watch sys_inotify_rm_watch -256 common migrate_pages sys_migrate_pages -257 common openat sys_openat -258 common mkdirat sys_mkdirat -259 common mknodat sys_mknodat -260 common fchownat sys_fchownat -261 common futimesat sys_futimesat -262 common newfstatat sys_newfstatat -263 common unlinkat sys_unlinkat -264 common renameat sys_renameat -265 common linkat sys_linkat -266 common symlinkat sys_symlinkat -267 common readlinkat sys_readlinkat -268 common fchmodat sys_fchmodat -269 common faccessat sys_faccessat -270 common pselect6 sys_pselect6 -271 common ppoll sys_ppoll -272 common unshare sys_unshare -273 64 set_robust_list sys_set_robust_list -274 64 get_robust_list sys_get_robust_list -275 common splice sys_splice -276 common tee sys_tee -277 common sync_file_range sys_sync_file_range -278 64 vmsplice sys_vmsplice -279 64 move_pages sys_move_pages -280 common utimensat sys_utimensat -281 common epoll_pwait sys_epoll_pwait -282 common signalfd sys_signalfd -283 common timerfd_create sys_timerfd_create -284 common eventfd sys_eventfd -285 common fallocate sys_fallocate -286 common timerfd_settime sys_timerfd_settime -287 common timerfd_gettime sys_timerfd_gettime -288 common accept4 sys_accept4 -289 common signalfd4 sys_signalfd4 -290 common eventfd2 sys_eventfd2 -291 common epoll_create1 sys_epoll_create1 -292 common dup3 sys_dup3 -293 common pipe2 sys_pipe2 -294 common inotify_init1 sys_inotify_init1 -295 64 preadv sys_preadv -296 64 pwritev sys_pwritev -297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo -298 common perf_event_open sys_perf_event_open -299 64 recvmmsg sys_recvmmsg -300 common fanotify_init sys_fanotify_init -301 common fanotify_mark sys_fanotify_mark -302 common prlimit64 sys_prlimit64 -303 common name_to_handle_at sys_name_to_handle_at -304 common open_by_handle_at sys_open_by_handle_at -305 common clock_adjtime sys_clock_adjtime -306 common syncfs sys_syncfs -307 64 sendmmsg sys_sendmmsg -308 common setns sys_setns -309 common getcpu sys_getcpu -310 64 process_vm_readv sys_process_vm_readv -311 64 process_vm_writev sys_process_vm_writev -312 common kcmp sys_kcmp -313 common finit_module sys_finit_module -314 common sched_setattr sys_sched_setattr -315 common sched_getattr sys_sched_getattr -316 common renameat2 sys_renameat2 -317 common seccomp sys_seccomp -318 common getrandom sys_getrandom -319 common memfd_create sys_memfd_create -320 common kexec_file_load sys_kexec_file_load -321 common bpf sys_bpf -322 64 execveat stub_execveat - -# -# x32-specific system call numbers start at 512 to avoid cache impact -# for native 64-bit operation. -# -512 x32 rt_sigaction compat_sys_rt_sigaction -513 x32 rt_sigreturn stub_x32_rt_sigreturn -514 x32 ioctl compat_sys_ioctl -515 x32 readv compat_sys_readv -516 x32 writev compat_sys_writev -517 x32 recvfrom compat_sys_recvfrom -518 x32 sendmsg compat_sys_sendmsg -519 x32 recvmsg compat_sys_recvmsg -520 x32 execve stub_x32_execve -521 x32 ptrace compat_sys_ptrace -522 x32 rt_sigpending compat_sys_rt_sigpending -523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait -524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo -525 x32 sigaltstack compat_sys_sigaltstack -526 x32 timer_create compat_sys_timer_create -527 x32 mq_notify compat_sys_mq_notify -528 x32 kexec_load compat_sys_kexec_load -529 x32 waitid compat_sys_waitid -530 x32 set_robust_list compat_sys_set_robust_list -531 x32 get_robust_list compat_sys_get_robust_list -532 x32 vmsplice compat_sys_vmsplice -533 x32 move_pages compat_sys_move_pages -534 x32 preadv compat_sys_preadv64 -535 x32 pwritev compat_sys_pwritev64 -536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo -537 x32 recvmmsg compat_sys_recvmmsg -538 x32 sendmmsg compat_sys_sendmmsg -539 x32 process_vm_readv compat_sys_process_vm_readv -540 x32 process_vm_writev compat_sys_process_vm_writev -541 x32 setsockopt compat_sys_setsockopt -542 x32 getsockopt compat_sys_getsockopt -543 x32 io_setup compat_sys_io_setup -544 x32 io_submit compat_sys_io_submit -545 x32 execveat stub_x32_execveat diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh deleted file mode 100644 index 31fd5f1f38f7..000000000000 --- a/arch/x86/syscalls/syscallhdr.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_ASM_X86_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - echo "#ifndef ${fileguard}" - echo "#define ${fileguard} 1" - echo "" - - while read nr abi name entry ; do - if [ -z "$offset" ]; then - echo "#define __NR_${prefix}${name} $nr" - else - echo "#define __NR_${prefix}${name} ($offset + $nr)" - fi - done - - echo "" - echo "#endif /* ${fileguard} */" -) > "$out" diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/syscalls/syscalltbl.sh deleted file mode 100644 index 0e7f8ec071e7..000000000000 --- a/arch/x86/syscalls/syscalltbl.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh - -in="$1" -out="$2" - -grep '^[0-9]' "$in" | sort -n | ( - while read nr abi name entry compat; do - abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - if [ -n "$compat" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $compat)" - elif [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $entry)" - fi - done -) > "$out" -- cgit v1.2.1 From 00398a0018d1334fedabfeaabd0fa563121de612 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:41:06 +0200 Subject: x86/asm/entry: Move the vsyscall code to arch/x86/entry/vsyscall/ The vsyscall code is entry code too, so move it to arch/x86/entry/vsyscall/. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 6 +- arch/x86/entry/syscall_32.c | 33 +++ arch/x86/entry/syscall_64.c | 32 +++ arch/x86/entry/vsyscall/Makefile | 7 + arch/x86/entry/vsyscall/vsyscall_64.c | 335 ++++++++++++++++++++++++++++++ arch/x86/entry/vsyscall/vsyscall_emu_64.S | 37 ++++ arch/x86/entry/vsyscall/vsyscall_gtod.c | 70 +++++++ arch/x86/entry/vsyscall/vsyscall_trace.h | 29 +++ arch/x86/kernel/Makefile | 3 - arch/x86/kernel/syscall_32.c | 33 --- arch/x86/kernel/syscall_64.c | 32 --- arch/x86/kernel/vsyscall_64.c | 335 ------------------------------ arch/x86/kernel/vsyscall_emu_64.S | 37 ---- arch/x86/kernel/vsyscall_gtod.c | 70 ------- arch/x86/kernel/vsyscall_trace.h | 29 --- 15 files changed, 547 insertions(+), 541 deletions(-) create mode 100644 arch/x86/entry/syscall_32.c create mode 100644 arch/x86/entry/syscall_64.c create mode 100644 arch/x86/entry/vsyscall/Makefile create mode 100644 arch/x86/entry/vsyscall/vsyscall_64.c create mode 100644 arch/x86/entry/vsyscall/vsyscall_emu_64.S create mode 100644 arch/x86/entry/vsyscall/vsyscall_gtod.c create mode 100644 arch/x86/entry/vsyscall/vsyscall_trace.h delete mode 100644 arch/x86/kernel/syscall_32.c delete mode 100644 arch/x86/kernel/syscall_64.c delete mode 100644 arch/x86/kernel/vsyscall_64.c delete mode 100644 arch/x86/kernel/vsyscall_emu_64.S delete mode 100644 arch/x86/kernel/vsyscall_gtod.c delete mode 100644 arch/x86/kernel/vsyscall_trace.h (limited to 'arch/x86') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 9df72c8a0ac2..b93cce1c6bb2 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,8 +1,10 @@ # # Makefile for the x86 low level entry code # -obj-y := entry_$(BITS).o thunk_$(BITS).o +obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o + obj-y += vdso/ +obj-y += vsyscall/ -obj-$(CONFIG_IA32_EMULATION) += ia32entry.o +obj-$(CONFIG_IA32_EMULATION) += ia32entry.o syscall_32.o diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c new file mode 100644 index 000000000000..3777189c4a19 --- /dev/null +++ b/arch/x86/entry/syscall_32.c @@ -0,0 +1,33 @@ +/* System call table for i386. */ + +#include +#include +#include +#include + +#ifdef CONFIG_IA32_EMULATION +#define SYM(sym, compat) compat +#else +#define SYM(sym, compat) sym +#define ia32_sys_call_table sys_call_table +#define __NR_ia32_syscall_max __NR_syscall_max +#endif + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; +#include +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c new file mode 100644 index 000000000000..4ac730b37f0b --- /dev/null +++ b/arch/x86/entry/syscall_64.c @@ -0,0 +1,32 @@ +/* System call table for x86-64. */ + +#include +#include +#include +#include +#include + +#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) + +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif + +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include +#undef __SYSCALL_64 + +#define __SYSCALL_64(nr, sym, compat) [nr] = sym, + +extern void sys_ni_syscall(void); + +asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile new file mode 100644 index 000000000000..a9f4856f622a --- /dev/null +++ b/arch/x86/entry/vsyscall/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the x86 low level vsyscall code +# +obj-y := vsyscall_gtod.o + +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o + diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c new file mode 100644 index 000000000000..2dcc6ff6fdcc --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2012-2014 Andy Lutomirski + * + * Based on the original implementation which is: + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Parts of the original code have been moved to arch/x86/vdso/vma.c + * + * This file implements vsyscall emulation. vsyscalls are a legacy ABI: + * Userspace can request certain kernel services by calling fixed + * addresses. This concept is problematic: + * + * - It interferes with ASLR. + * - It's awkward to write code that lives in kernel addresses but is + * callable by userspace at fixed addresses. + * - The whole concept is impossible for 32-bit compat userspace. + * - UML cannot easily virtualize a vsyscall. + * + * As of mid-2014, I believe that there is no new userspace code that + * will use a vsyscall if the vDSO is present. I hope that there will + * soon be no new userspace code that will ever use a vsyscall. + * + * The code in this file emulates vsyscalls when notified of a page + * fault to a vsyscall address. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "vsyscall_trace.h" + +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; + +static int __init vsyscall_setup(char *str) +{ + if (str) { + if (!strcmp("emulate", str)) + vsyscall_mode = EMULATE; + else if (!strcmp("native", str)) + vsyscall_mode = NATIVE; + else if (!strcmp("none", str)) + vsyscall_mode = NONE; + else + return -EINVAL; + + return 0; + } + + return -EINVAL; +} +early_param("vsyscall", vsyscall_setup); + +static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, + const char *message) +{ + if (!show_unhandled_signals) + return; + + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); +} + +static int addr_to_vsyscall_nr(unsigned long addr) +{ + int nr; + + if ((addr & ~0xC00UL) != VSYSCALL_ADDR) + return -EINVAL; + + nr = (addr & 0xC00UL) >> 10; + if (nr >= 3) + return -EINVAL; + + return nr; +} + +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ + /* + * XXX: if access_ok, get_user, and put_user handled + * sig_on_uaccess_error, this could go away. + */ + + if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { + siginfo_t info; + struct thread_struct *thread = ¤t->thread; + + thread->error_code = 6; /* user fault, no page, write */ + thread->cr2 = ptr; + thread->trap_nr = X86_TRAP_PF; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)ptr; + + force_sig_info(SIGSEGV, &info, current); + return false; + } else { + return true; + } +} + +bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +{ + struct task_struct *tsk; + unsigned long caller; + int vsyscall_nr, syscall_nr, tmp; + int prev_sig_on_uaccess_error; + long ret; + + /* + * No point in checking CS -- the only way to get here is a user mode + * trap to a high address, which means that we're in 64-bit user code. + */ + + WARN_ON_ONCE(address != regs->ip); + + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); + return false; + } + + vsyscall_nr = addr_to_vsyscall_nr(address); + + trace_emulate_vsyscall(vsyscall_nr); + + if (vsyscall_nr < 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); + goto sigsegv; + } + + if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall with bad stack (exploit attempt?)"); + goto sigsegv; + } + + tsk = current; + + /* + * Check for access_ok violations and find the syscall nr. + * + * NULL is a valid user pointer (in the access_ok sense) on 32-bit and + * 64-bit, so we don't need to special-case it here. For all the + * vsyscalls, NULL means "don't write anything" not "write it at + * address 0". + */ + switch (vsyscall_nr) { + case 0: + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_gettimeofday; + break; + + case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_time; + break; + + case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_getcpu; + break; + } + + /* + * Handle seccomp. regs->ip must be the original value. + * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. + * + * We could optimize the seccomp disabled case, but performance + * here doesn't matter. + */ + regs->orig_ax = syscall_nr; + regs->ax = -ENOSYS; + tmp = secure_computing(); + if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { + warn_bad_vsyscall(KERN_DEBUG, regs, + "seccomp tried to change syscall nr or ip"); + do_exit(SIGSYS); + } + regs->orig_ax = -1; + if (tmp) + goto do_ret; /* skip requested */ + + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + ret = -EFAULT; + switch (vsyscall_nr) { + case 0: + ret = sys_gettimeofday( + (struct timeval __user *)regs->di, + (struct timezone __user *)regs->si); + break; + + case 1: + ret = sys_time((time_t __user *)regs->di); + break; + + case 2: + ret = sys_getcpu((unsigned __user *)regs->di, + (unsigned __user *)regs->si, + NULL); + break; + } + + current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + +check_fault: + if (ret == -EFAULT) { + /* Bad news -- userspace fed a bad pointer to a vsyscall. */ + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall fault (exploit attempt?)"); + + /* + * If we failed to generate a signal for any reason, + * generate one here. (This should be impossible.) + */ + if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && + !sigismember(&tsk->pending.signal, SIGSEGV))) + goto sigsegv; + + return true; /* Don't emulate the ret. */ + } + + regs->ax = ret; + +do_ret: + /* Emulate a ret instruction. */ + regs->ip = caller; + regs->sp += 8; + return true; + +sigsegv: + force_sig(SIGSEGV, current); + return true; +} + +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, +}; + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ +#ifdef CONFIG_IA32_EMULATION + if (!mm || mm->context.ia32_compat) + return NULL; +#endif + if (vsyscall_mode == NONE) + return NULL; + return &gate_vma; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; + + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. + */ +int in_gate_area_no_mm(unsigned long addr) +{ + return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; +} + +void __init map_vsyscall(void) +{ + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + + if (vsyscall_mode != NONE) + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S new file mode 100644 index 000000000000..c9596a9af159 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S @@ -0,0 +1,37 @@ +/* + * vsyscall_emu_64.S: Vsyscall emulation page + * + * Copyright (c) 2011 Andy Lutomirski + * + * Subject to the GNU General Public License, version 2 + */ + +#include + +#include +#include +#include + +__PAGE_ALIGNED_DATA + .globl __vsyscall_page + .balign PAGE_SIZE, 0xcc + .type __vsyscall_page, @object +__vsyscall_page: + + mov $__NR_gettimeofday, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_time, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_getcpu, %rax + syscall + ret + + .balign 4096, 0xcc + + .size __vsyscall_page, 4096 diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c new file mode 100644 index 000000000000..51e330416995 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Modified for x86 32 bit architecture by + * Stefani Seibold + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + */ + +#include +#include +#include + +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); + +void update_vsyscall_tz(void) +{ + vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; + vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + + gtod_write_begin(vdata); + + /* copy vsyscall data */ + vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->cycle_last = tk->tkr_mono.cycle_last; + vdata->mask = tk->tkr_mono.mask; + vdata->mult = tk->tkr_mono.mult; + vdata->shift = tk->tkr_mono.shift; + + vdata->wall_time_sec = tk->xtime_sec; + vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec + << tk->tkr_mono.shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; + vdata->monotonic_time_sec++; + } + + vdata->wall_time_coarse_sec = tk->xtime_sec; + vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); + + vdata->monotonic_time_coarse_sec = + vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_coarse_nsec = + vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + + while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { + vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; + vdata->monotonic_time_coarse_sec++; + } + + gtod_write_end(vdata); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h new file mode 100644 index 000000000000..9dd7359a38a8 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_trace.h @@ -0,0 +1,29 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsyscall + +#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __VSYSCALL_TRACE_H + +#include + +TRACE_EVENT(emulate_vsyscall, + + TP_PROTO(int nr), + + TP_ARGS(nr), + + TP_STRUCT__entry(__field(int, nr)), + + TP_fast_assign( + __entry->nr = nr; + ), + + TP_printk("nr = %d", __entry->nr) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ +#define TRACE_INCLUDE_FILE vsyscall_trace +#include diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9d3ee054453d..01663ee5f1b7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,9 +31,6 @@ obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o -obj-y += syscall_$(BITS).o vsyscall_gtod.o -obj-$(CONFIG_IA32_EMULATION) += syscall_32.o -obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c deleted file mode 100644 index 3777189c4a19..000000000000 --- a/arch/x86/kernel/syscall_32.c +++ /dev/null @@ -1,33 +0,0 @@ -/* System call table for i386. */ - -#include -#include -#include -#include - -#ifdef CONFIG_IA32_EMULATION -#define SYM(sym, compat) compat -#else -#define SYM(sym, compat) sym -#define ia32_sys_call_table sys_call_table -#define __NR_ia32_syscall_max __NR_syscall_max -#endif - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; -#include -#undef __SYSCALL_I386 - -#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), - -typedef asmlinkage void (*sys_call_ptr_t)(void); - -extern asmlinkage void sys_ni_syscall(void); - -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, -#include -}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c deleted file mode 100644 index 4ac730b37f0b..000000000000 --- a/arch/x86/kernel/syscall_64.c +++ /dev/null @@ -1,32 +0,0 @@ -/* System call table for x86-64. */ - -#include -#include -#include -#include -#include - -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) - -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif - -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; -#include -#undef __SYSCALL_64 - -#define __SYSCALL_64(nr, sym, compat) [nr] = sym, - -extern void sys_ni_syscall(void); - -asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, -#include -}; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c deleted file mode 100644 index 2dcc6ff6fdcc..000000000000 --- a/arch/x86/kernel/vsyscall_64.c +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright (c) 2012-2014 Andy Lutomirski - * - * Based on the original implementation which is: - * Copyright (C) 2001 Andrea Arcangeli SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Parts of the original code have been moved to arch/x86/vdso/vma.c - * - * This file implements vsyscall emulation. vsyscalls are a legacy ABI: - * Userspace can request certain kernel services by calling fixed - * addresses. This concept is problematic: - * - * - It interferes with ASLR. - * - It's awkward to write code that lives in kernel addresses but is - * callable by userspace at fixed addresses. - * - The whole concept is impossible for 32-bit compat userspace. - * - UML cannot easily virtualize a vsyscall. - * - * As of mid-2014, I believe that there is no new userspace code that - * will use a vsyscall if the vDSO is present. I hope that there will - * soon be no new userspace code that will ever use a vsyscall. - * - * The code in this file emulates vsyscalls when notified of a page - * fault to a vsyscall address. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include "vsyscall_trace.h" - -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; - -static int __init vsyscall_setup(char *str) -{ - if (str) { - if (!strcmp("emulate", str)) - vsyscall_mode = EMULATE; - else if (!strcmp("native", str)) - vsyscall_mode = NATIVE; - else if (!strcmp("none", str)) - vsyscall_mode = NONE; - else - return -EINVAL; - - return 0; - } - - return -EINVAL; -} -early_param("vsyscall", vsyscall_setup); - -static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, - const char *message) -{ - if (!show_unhandled_signals) - return; - - printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, current->comm, task_pid_nr(current), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); -} - -static int addr_to_vsyscall_nr(unsigned long addr) -{ - int nr; - - if ((addr & ~0xC00UL) != VSYSCALL_ADDR) - return -EINVAL; - - nr = (addr & 0xC00UL) >> 10; - if (nr >= 3) - return -EINVAL; - - return nr; -} - -static bool write_ok_or_segv(unsigned long ptr, size_t size) -{ - /* - * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_error, this could go away. - */ - - if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { - siginfo_t info; - struct thread_struct *thread = ¤t->thread; - - thread->error_code = 6; /* user fault, no page, write */ - thread->cr2 = ptr; - thread->trap_nr = X86_TRAP_PF; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGSEGV; - info.si_errno = 0; - info.si_code = SEGV_MAPERR; - info.si_addr = (void __user *)ptr; - - force_sig_info(SIGSEGV, &info, current); - return false; - } else { - return true; - } -} - -bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) -{ - struct task_struct *tsk; - unsigned long caller; - int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_error; - long ret; - - /* - * No point in checking CS -- the only way to get here is a user mode - * trap to a high address, which means that we're in 64-bit user code. - */ - - WARN_ON_ONCE(address != regs->ip); - - if (vsyscall_mode == NONE) { - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall attempted with vsyscall=none"); - return false; - } - - vsyscall_nr = addr_to_vsyscall_nr(address); - - trace_emulate_vsyscall(vsyscall_nr); - - if (vsyscall_nr < 0) { - warn_bad_vsyscall(KERN_WARNING, regs, - "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); - goto sigsegv; - } - - if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { - warn_bad_vsyscall(KERN_WARNING, regs, - "vsyscall with bad stack (exploit attempt?)"); - goto sigsegv; - } - - tsk = current; - - /* - * Check for access_ok violations and find the syscall nr. - * - * NULL is a valid user pointer (in the access_ok sense) on 32-bit and - * 64-bit, so we don't need to special-case it here. For all the - * vsyscalls, NULL means "don't write anything" not "write it at - * address 0". - */ - switch (vsyscall_nr) { - case 0: - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_gettimeofday; - break; - - case 1: - if (!write_ok_or_segv(regs->di, sizeof(time_t))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_time; - break; - - case 2: - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_getcpu; - break; - } - - /* - * Handle seccomp. regs->ip must be the original value. - * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. - * - * We could optimize the seccomp disabled case, but performance - * here doesn't matter. - */ - regs->orig_ax = syscall_nr; - regs->ax = -ENOSYS; - tmp = secure_computing(); - if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { - warn_bad_vsyscall(KERN_DEBUG, regs, - "seccomp tried to change syscall nr or ip"); - do_exit(SIGSYS); - } - regs->orig_ax = -1; - if (tmp) - goto do_ret; /* skip requested */ - - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; - - ret = -EFAULT; - switch (vsyscall_nr) { - case 0: - ret = sys_gettimeofday( - (struct timeval __user *)regs->di, - (struct timezone __user *)regs->si); - break; - - case 1: - ret = sys_time((time_t __user *)regs->di); - break; - - case 2: - ret = sys_getcpu((unsigned __user *)regs->di, - (unsigned __user *)regs->si, - NULL); - break; - } - - current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; - -check_fault: - if (ret == -EFAULT) { - /* Bad news -- userspace fed a bad pointer to a vsyscall. */ - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall fault (exploit attempt?)"); - - /* - * If we failed to generate a signal for any reason, - * generate one here. (This should be impossible.) - */ - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && - !sigismember(&tsk->pending.signal, SIGSEGV))) - goto sigsegv; - - return true; /* Don't emulate the ret. */ - } - - regs->ax = ret; - -do_ret: - /* Emulate a ret instruction. */ - regs->ip = caller; - regs->sp += 8; - return true; - -sigsegv: - force_sig(SIGSEGV, current); - return true; -} - -/* - * A pseudo VMA to allow ptrace access for the vsyscall page. This only - * covers the 64bit vsyscall page now. 32bit has a real VMA now and does - * not need special handling anymore: - */ -static const char *gate_vma_name(struct vm_area_struct *vma) -{ - return "[vsyscall]"; -} -static struct vm_operations_struct gate_vma_ops = { - .name = gate_vma_name, -}; -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_ADDR, - .vm_end = VSYSCALL_ADDR + PAGE_SIZE, - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC, - .vm_ops = &gate_vma_ops, -}; - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (!mm || mm->context.ia32_compat) - return NULL; -#endif - if (vsyscall_mode == NONE) - return NULL; - return &gate_vma; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(mm); - - if (!vma) - return 0; - - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* - * Use this when you have no reliable mm, typically from interrupt - * context. It is less reliable than using a task's mm and may give - * false positives. - */ -int in_gate_area_no_mm(unsigned long addr) -{ - return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; -} - -void __init map_vsyscall(void) -{ - extern char __vsyscall_page; - unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - - if (vsyscall_mode != NONE) - __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); - - BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != - (unsigned long)VSYSCALL_ADDR); -} diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S deleted file mode 100644 index c9596a9af159..000000000000 --- a/arch/x86/kernel/vsyscall_emu_64.S +++ /dev/null @@ -1,37 +0,0 @@ -/* - * vsyscall_emu_64.S: Vsyscall emulation page - * - * Copyright (c) 2011 Andy Lutomirski - * - * Subject to the GNU General Public License, version 2 - */ - -#include - -#include -#include -#include - -__PAGE_ALIGNED_DATA - .globl __vsyscall_page - .balign PAGE_SIZE, 0xcc - .type __vsyscall_page, @object -__vsyscall_page: - - mov $__NR_gettimeofday, %rax - syscall - ret - - .balign 1024, 0xcc - mov $__NR_time, %rax - syscall - ret - - .balign 1024, 0xcc - mov $__NR_getcpu, %rax - syscall - ret - - .balign 4096, 0xcc - - .size __vsyscall_page, 4096 diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c deleted file mode 100644 index 51e330416995..000000000000 --- a/arch/x86/kernel/vsyscall_gtod.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2001 Andrea Arcangeli SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Modified for x86 32 bit architecture by - * Stefani Seibold - * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany - * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. - * - */ - -#include -#include -#include - -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); - -void update_vsyscall_tz(void) -{ - vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; - vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; -} - -void update_vsyscall(struct timekeeper *tk) -{ - struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; - - gtod_write_begin(vdata); - - /* copy vsyscall data */ - vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; - vdata->cycle_last = tk->tkr_mono.cycle_last; - vdata->mask = tk->tkr_mono.mask; - vdata->mult = tk->tkr_mono.mult; - vdata->shift = tk->tkr_mono.shift; - - vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; - - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec - + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr_mono.shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; - vdata->monotonic_time_sec++; - } - - vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> - tk->tkr_mono.shift); - - vdata->monotonic_time_coarse_sec = - vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_coarse_nsec = - vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; - - while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { - vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; - vdata->monotonic_time_coarse_sec++; - } - - gtod_write_end(vdata); -} diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h deleted file mode 100644 index a8b2edec54fe..000000000000 --- a/arch/x86/kernel/vsyscall_trace.h +++ /dev/null @@ -1,29 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM vsyscall - -#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) -#define __VSYSCALL_TRACE_H - -#include - -TRACE_EVENT(emulate_vsyscall, - - TP_PROTO(int nr), - - TP_ARGS(nr), - - TP_STRUCT__entry(__field(int, nr)), - - TP_fast_assign( - __entry->nr = nr; - ), - - TP_printk("nr = %d", __entry->nr) -); - -#endif - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../arch/x86/kernel -#define TRACE_INCLUDE_FILE vsyscall_trace -#include -- cgit v1.2.1 From e194bbdf362ba7d53cfd23ba24f1a7c90ef69a74 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jun 2015 09:51:50 +0200 Subject: kvm: x86: default legacy PCI device assignment support to "n" VFIO has proved itself a much better option than KVM's built-in device assignment. It is mature, provides better isolation because it enforces ACS, and even the userspace code is being tested on a wider variety of hardware these days than the legacy support. Disable legacy device assignment by default. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 413a7bf9efbb..a0f06a5947c5 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -88,13 +88,14 @@ config KVM_MMU_AUDIT config KVM_DEVICE_ASSIGNMENT bool "KVM legacy PCI device assignment support" depends on KVM && PCI && IOMMU_API - default y + default n ---help--- Provide support for legacy PCI device assignment through KVM. The kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes much of this support. + framework through VFIO, which supersedes this support and provides + better security. - If unsure, say Y. + If unsure, say N. # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. -- cgit v1.2.1 From ce40cd3fc7fa40a6119e5fe6c0f2bc0eb4541009 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 30 May 2015 14:31:24 +0200 Subject: kvm: x86: fix kvm_apic_has_events to check for NULL pointer Malicious (or egregiously buggy) userspace can trigger it, but it should never happen in normal operation. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 71b150cae5f9..9d8fcde52027 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -150,7 +150,7 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm) static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic->pending_events; + return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; } static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) -- cgit v1.2.1 From e69fab5df45f993cb3b8cc0625a7791e86450a99 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jun 2015 10:44:44 +0200 Subject: KVM: x86: clear hidden CPU state at reset time This was noticed by Radim while reviewing the implementation of system management mode. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 79dde1656db6..bd6bcd54cd44 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7254,6 +7254,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + vcpu->arch.hflags = 0; + atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; -- cgit v1.2.1 From 62ef68bb4d00f1a662e487f3fc44ce8521c416aa Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 5 May 2015 12:08:55 +0200 Subject: KVM: x86: introduce num_emulated_msrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We will want to filter away MSR_IA32_SMBASE from the emulated_msrs if the host CPU does not support SMM virtualization. Introduce the logic to do that, and also move paravirt MSRs to emulated_msrs for simplicity and to get rid of KVM_SAVE_MSRS_BEGIN. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd6bcd54cd44..7119b3b510c4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -925,17 +925,11 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * * This list is modified at module load time to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in the beginning of the list. + * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs + * may depend on host virtualization features rather than host cpu features. */ -#define KVM_SAVE_MSRS_BEGIN 12 static u32 msrs_to_save[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -947,7 +941,14 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; -static const u32 emulated_msrs[] = { +static u32 emulated_msrs[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, + MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, @@ -955,6 +956,8 @@ static const u32 emulated_msrs[] = { MSR_IA32_MCG_CTL, }; +static unsigned num_emulated_msrs; + bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) @@ -2928,7 +2931,7 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) goto out; n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) goto out; r = -E2BIG; @@ -2940,7 +2943,7 @@ long kvm_arch_dev_ioctl(struct file *filp, goto out; if (copy_to_user(user_msr_list->indices + num_msrs_to_save, &emulated_msrs, - ARRAY_SIZE(emulated_msrs) * sizeof(u32))) + num_emulated_msrs * sizeof(u32))) goto out; r = 0; break; @@ -4206,8 +4209,7 @@ static void kvm_init_msr_list(void) u32 dummy[2]; unsigned i, j; - /* skip the first msrs in the list. KVM-specific */ - for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; @@ -4232,6 +4234,18 @@ static void kvm_init_msr_list(void) j++; } num_msrs_to_save = j; + + for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { + switch (emulated_msrs[i]) { + default: + break; + } + + if (j < i) + emulated_msrs[j] = emulated_msrs[i]; + j++; + } + num_emulated_msrs = j; } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, -- cgit v1.2.1 From 609e36d372ad9329269e4a1467bd35311893d1d6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Apr 2015 15:30:38 +0200 Subject: KVM: x86: pass host_initiated to functions that read MSRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SMBASE is only readable from SMM for the VCPU, but it must be always accessible if userspace is accessing it. Thus, all functions that read MSRs are changed to accept a struct msr_data; the host_initiated and index fields are pre-initialized, while the data field is filled on return. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 +-- arch/x86/kvm/svm.c | 54 ++++++++++---------- arch/x86/kvm/vmx.c | 62 ++++++++++++----------- arch/x86/kvm/x86.c | 106 ++++++++++++++++++++++++---------------- 4 files changed, 127 insertions(+), 101 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7276107b35df..4e299fcd0eb6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -721,7 +721,7 @@ struct kvm_x86_ops { void (*vcpu_put)(struct kvm_vcpu *vcpu); void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); void (*get_segment)(struct kvm_vcpu *vcpu, @@ -941,7 +941,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); +int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; @@ -970,7 +970,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b9f9e1073e50..a08df4145173 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3069,42 +3069,42 @@ static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) svm_scale_tsc(vcpu, host_tsc); } -static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) +static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); - switch (ecx) { + switch (msr_info->index) { case MSR_IA32_TSC: { - *data = svm->vmcb->control.tsc_offset + + msr_info->data = svm->vmcb->control.tsc_offset + svm_scale_tsc(vcpu, native_read_tsc()); break; } case MSR_STAR: - *data = svm->vmcb->save.star; + msr_info->data = svm->vmcb->save.star; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - *data = svm->vmcb->save.lstar; + msr_info->data = svm->vmcb->save.lstar; break; case MSR_CSTAR: - *data = svm->vmcb->save.cstar; + msr_info->data = svm->vmcb->save.cstar; break; case MSR_KERNEL_GS_BASE: - *data = svm->vmcb->save.kernel_gs_base; + msr_info->data = svm->vmcb->save.kernel_gs_base; break; case MSR_SYSCALL_MASK: - *data = svm->vmcb->save.sfmask; + msr_info->data = svm->vmcb->save.sfmask; break; #endif case MSR_IA32_SYSENTER_CS: - *data = svm->vmcb->save.sysenter_cs; + msr_info->data = svm->vmcb->save.sysenter_cs; break; case MSR_IA32_SYSENTER_EIP: - *data = svm->sysenter_eip; + msr_info->data = svm->sysenter_eip; break; case MSR_IA32_SYSENTER_ESP: - *data = svm->sysenter_esp; + msr_info->data = svm->sysenter_esp; break; /* * Nobody will change the following 5 values in the VMCB so we can @@ -3112,31 +3112,31 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) * implemented. */ case MSR_IA32_DEBUGCTLMSR: - *data = svm->vmcb->save.dbgctl; + msr_info->data = svm->vmcb->save.dbgctl; break; case MSR_IA32_LASTBRANCHFROMIP: - *data = svm->vmcb->save.br_from; + msr_info->data = svm->vmcb->save.br_from; break; case MSR_IA32_LASTBRANCHTOIP: - *data = svm->vmcb->save.br_to; + msr_info->data = svm->vmcb->save.br_to; break; case MSR_IA32_LASTINTFROMIP: - *data = svm->vmcb->save.last_excp_from; + msr_info->data = svm->vmcb->save.last_excp_from; break; case MSR_IA32_LASTINTTOIP: - *data = svm->vmcb->save.last_excp_to; + msr_info->data = svm->vmcb->save.last_excp_to; break; case MSR_VM_HSAVE_PA: - *data = svm->nested.hsave_msr; + msr_info->data = svm->nested.hsave_msr; break; case MSR_VM_CR: - *data = svm->nested.vm_cr_msr; + msr_info->data = svm->nested.vm_cr_msr; break; case MSR_IA32_UCODE_REV: - *data = 0x01000065; + msr_info->data = 0x01000065; break; default: - return kvm_get_msr_common(vcpu, ecx, data); + return kvm_get_msr_common(vcpu, msr_info); } return 0; } @@ -3144,16 +3144,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) static int rdmsr_interception(struct vcpu_svm *svm) { u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); - u64 data; + struct msr_data msr_info; - if (svm_get_msr(&svm->vcpu, ecx, &data)) { + msr_info.index = ecx; + msr_info.host_initiated = false; + if (svm_get_msr(&svm->vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); } else { - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(ecx, msr_info.data); - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff); - kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32); + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, + msr_info.data & 0xffffffff); + kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, + msr_info.data >> 32); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; skip_emulated_instruction(&svm->vcpu); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9cf5030927d5..b5b7b8ab2f59 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2622,76 +2622,69 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - u64 data; struct shared_msr_entry *msr; - if (!pdata) { - printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); - return -EINVAL; - } - - switch (msr_index) { + switch (msr_info->index) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: - data = vmcs_readl(GUEST_FS_BASE); + msr_info->data = vmcs_readl(GUEST_FS_BASE); break; case MSR_GS_BASE: - data = vmcs_readl(GUEST_GS_BASE); + msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: vmx_load_host_state(to_vmx(vcpu)); - data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base; break; #endif case MSR_EFER: - return kvm_get_msr_common(vcpu, msr_index, pdata); + return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_TSC: - data = guest_read_tsc(); + msr_info->data = guest_read_tsc(); break; case MSR_IA32_SYSENTER_CS: - data = vmcs_read32(GUEST_SYSENTER_CS); + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP: - data = vmcs_readl(GUEST_SYSENTER_EIP); + msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP: - data = vmcs_readl(GUEST_SYSENTER_ESP); + msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: if (!vmx_mpx_supported()) return 1; - data = vmcs_read64(GUEST_BNDCFGS); + msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; case MSR_IA32_FEATURE_CONTROL: if (!nested_vmx_allowed(vcpu)) return 1; - data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; - return vmx_get_vmx_msr(vcpu, msr_index, pdata); + return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; - data = vcpu->arch.ia32_xss; + msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: if (!to_vmx(vcpu)->rdtscp_enabled) return 1; /* Otherwise falls through */ default: - msr = find_msr_entry(to_vmx(vcpu), msr_index); + msr = find_msr_entry(to_vmx(vcpu), msr_info->index); if (msr) { - data = msr->data; + msr_info->data = msr->data; break; } - return kvm_get_msr_common(vcpu, msr_index, pdata); + return kvm_get_msr_common(vcpu, msr_info); } - *pdata = data; return 0; } @@ -5473,19 +5466,21 @@ static int handle_cpuid(struct kvm_vcpu *vcpu) static int handle_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data; + struct msr_data msr_info; - if (vmx_get_msr(vcpu, ecx, &data)) { + msr_info.index = ecx; + msr_info.host_initiated = false; + if (vmx_get_msr(vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(ecx, msr_info.data); /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; + vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; + vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; skip_emulated_instruction(vcpu); return 1; } @@ -9147,6 +9142,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) struct vmx_msr_entry e; for (i = 0; i < count; i++) { + struct msr_data msr_info; if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), &e, 2 * sizeof(u32))) { @@ -9161,7 +9157,9 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); return -EINVAL; } - if (kvm_get_msr(vcpu, e.index, &e.value)) { + msr_info.host_initiated = false; + msr_info.index = e.index; + if (kvm_get_msr(vcpu, &msr_info)) { pr_warn_ratelimited( "%s cannot read MSR (%u, 0x%x)\n", __func__, i, e.index); @@ -9170,10 +9168,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) if (kvm_write_guest(vcpu->kvm, gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), - &e.value, sizeof(e.value))) { + &msr_info.data, sizeof(msr_info.data))) { pr_warn_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", - __func__, i, e.index, e.value); + __func__, i, e.index, msr_info.data); return -EINVAL; } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7119b3b510c4..3632928191f3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1051,6 +1051,21 @@ EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention */ +static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + struct msr_data msr; + int r; + + msr.index = index; + msr.host_initiated = true; + r = kvm_get_msr(vcpu, &msr); + if (r) + return r; + + *data = msr.data; + return 0; +} + static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { struct msr_data msr; @@ -2448,9 +2463,9 @@ EXPORT_SYMBOL_GPL(kvm_set_msr_common); * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); + return kvm_x86_ops->get_msr(vcpu, msr); } EXPORT_SYMBOL_GPL(kvm_get_msr); @@ -2587,11 +2602,11 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 data; - switch (msr) { + switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: @@ -2614,26 +2629,26 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: case MSR_AMD64_BU_CFG2: - data = 0; + msr_info->data = 0; break; case MSR_P6_PERFCTR0: case MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_get_msr(vcpu, msr, pdata); - data = 0; + if (kvm_pmu_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); + msr_info->data = 0; break; case MSR_IA32_UCODE_REV: - data = 0x100000000ULL; + msr_info->data = 0x100000000ULL; break; case MSR_MTRRcap: - data = 0x500 | KVM_NR_VAR_MTRR; + msr_info->data = 0x500 | KVM_NR_VAR_MTRR; break; case 0x200 ... 0x2ff: - return get_msr_mtrr(vcpu, msr, pdata); + return get_msr_mtrr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ - data = 3; + msr_info->data = 3; break; /* * MSR_EBC_FREQUENCY_ID @@ -2647,48 +2662,48 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * multiplying by zero otherwise. */ case MSR_EBC_FREQUENCY_ID: - data = 1 << 24; + msr_info->data = 1 << 24; break; case MSR_IA32_APICBASE: - data = kvm_get_apic_base(vcpu); + msr_info->data = kvm_get_apic_base(vcpu); break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: - return kvm_x2apic_msr_read(vcpu, msr, pdata); + return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_TSCDEADLINE: - data = kvm_get_lapic_tscdeadline_msr(vcpu); + msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; case MSR_IA32_TSC_ADJUST: - data = (u64)vcpu->arch.ia32_tsc_adjust_msr; + msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; break; case MSR_IA32_MISC_ENABLE: - data = vcpu->arch.ia32_misc_enable_msr; + msr_info->data = vcpu->arch.ia32_misc_enable_msr; break; case MSR_IA32_PERF_STATUS: /* TSC increment by tick */ - data = 1000ULL; + msr_info->data = 1000ULL; /* CPU multiplier */ data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: - data = vcpu->arch.efer; + msr_info->data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: case MSR_KVM_WALL_CLOCK_NEW: - data = vcpu->kvm->arch.wall_clock; + msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: case MSR_KVM_SYSTEM_TIME_NEW: - data = vcpu->arch.time; + msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: - data = vcpu->arch.apf.msr_val; + msr_info->data = vcpu->arch.apf.msr_val; break; case MSR_KVM_STEAL_TIME: - data = vcpu->arch.st.msr_val; + msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: - data = vcpu->arch.pv_eoi.msr_val; + msr_info->data = vcpu->arch.pv_eoi.msr_val; break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: @@ -2696,7 +2711,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return get_msr_mce(vcpu, msr, pdata); + return get_msr_mce(vcpu, msr_info->index, &msr_info->data); case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -2707,17 +2722,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * type 6, model 8 and higher from exploding due to * the rdmsr failing. */ - data = 0x20000000; + msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { + if (kvm_hv_msr_partition_wide(msr_info->index)) { int r; mutex_lock(&vcpu->kvm->lock); - r = get_msr_hyperv_pw(vcpu, msr, pdata); + r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data); mutex_unlock(&vcpu->kvm->lock); return r; } else - return get_msr_hyperv(vcpu, msr, pdata); + return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current @@ -2730,31 +2745,30 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * L2 cache control register 3: 64GB range, 256KB size, * enabled, latency 0x1, configured */ - data = 0xbe702111; + msr_info->data = 0xbe702111; break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) return 1; - data = vcpu->arch.osvw.length; + msr_info->data = vcpu->arch.osvw.length; break; case MSR_AMD64_OSVW_STATUS: if (!guest_cpuid_has_osvw(vcpu)) return 1; - data = vcpu->arch.osvw.status; + msr_info->data = vcpu->arch.osvw.status; break; default: - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_get_msr(vcpu, msr, pdata); + if (kvm_pmu_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); return 1; } else { - vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); - data = 0; + vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); + msr_info->data = 0; } break; } - *pdata = data; return 0; } EXPORT_SYMBOL_GPL(kvm_get_msr_common); @@ -3525,7 +3539,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_MSRS: - r = msr_io(vcpu, argp, kvm_get_msr, 1); + r = msr_io(vcpu, argp, do_get_msr, 1); break; case KVM_SET_MSRS: r = msr_io(vcpu, argp, do_set_msr, 0); @@ -5056,7 +5070,17 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + struct msr_data msr; + int r; + + msr.index = msr_index; + msr.host_initiated = false; + r = kvm_get_msr(emul_to_vcpu(ctxt), &msr); + if (r) + return r; + + *pdata = msr.data; + return 0; } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, -- cgit v1.2.1 From a584539b24b87dc8be83a713006396cabec47833 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 1 Apr 2015 18:18:53 +0200 Subject: KVM: x86: pass the whole hflags field to emulator and back MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hflags field will contain information about system management mode and will be useful for the emulator. Pass the entire field rather than just the guest-mode information. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 5 ++++- arch/x86/kvm/emulate.c | 6 +++--- arch/x86/kvm/x86.c | 10 +++++++++- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 57a9d94fe160..7410879a41f7 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -262,6 +262,9 @@ enum x86emul_mode { X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */ }; +/* These match some of the HF_* flags defined in kvm_host.h */ +#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ + struct x86_emulate_ctxt { const struct x86_emulate_ops *ops; @@ -273,8 +276,8 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; + int emul_flags; - bool guest_mode; /* guest running a nested guest */ bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9b655d113fc6..a1c6c25552e9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4895,7 +4895,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -4924,7 +4924,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -4978,7 +4978,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3632928191f3..7aec25f2f45c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5240,7 +5240,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - ctxt->guest_mode = is_guest_mode(vcpu); + BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); + ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; @@ -5409,6 +5410,11 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); +void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) +{ + vcpu->arch.hflags = emul_flags; +} + static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { @@ -5608,6 +5614,8 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + if (vcpu->arch.hflags != ctxt->emul_flags) + kvm_set_hflags(vcpu, ctxt->emul_flags); kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) kvm_vcpu_check_singlestep(vcpu, rflags, &r); -- cgit v1.2.1 From f077825a8758d79838a757dafb79adcdd047ef3a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 1 Apr 2015 15:06:40 +0200 Subject: KVM: x86: API changes for SMM support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch includes changes to the external API for SMM support. Userspace can predicate the availability of the new fields and ioctls on a new capability, KVM_CAP_X86_SMM, which is added at the end of the patch series. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/include/uapi/asm/kvm.h | 11 ++++++++++- arch/x86/kvm/kvm_cache_regs.h | 5 +++++ arch/x86/kvm/lapic.h | 5 +++++ arch/x86/kvm/x86.c | 40 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 61 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4e299fcd0eb6..d52d7aea375f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -471,6 +471,7 @@ struct kvm_vcpu_arch { atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ bool nmi_injected; /* Trying to inject an NMI this entry */ + bool smi_pending; /* SMI queued after currently running handler */ struct mtrr_state_type mtrr_state; u64 pat; @@ -1115,6 +1116,8 @@ enum { #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ +#define HF_SMM_MASK (1 << 6) +#define HF_SMM_INSIDE_NMI_MASK (1 << 7) /* * Hardware virtualization extension instructions may fault if a diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 2fec75e4b1e1..a4ae82eb82aa 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -106,6 +106,8 @@ struct kvm_ioapic_state { #define KVM_IRQCHIP_IOAPIC 2 #define KVM_NR_IRQCHIPS 3 +#define KVM_RUN_X86_SMM (1 << 0) + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ @@ -281,6 +283,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 +#define KVM_VCPUEVENT_VALID_SMM 0x00000008 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -309,7 +312,13 @@ struct kvm_vcpu_events { } nmi; __u32 sipi_vector; __u32 flags; - __u32 reserved[10]; + struct { + __u8 smm; + __u8 pending; + __u8 smm_inside_nmi; + __u8 latched_init; + } smi; + __u32 reserved[9]; }; /* for KVM_GET/SET_DEBUGREGS */ diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 544076c4f44b..e1e89ee4af75 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -99,4 +99,9 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu) return vcpu->arch.hflags & HF_GUEST_MASK; } +static inline bool is_smm(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hflags & HF_SMM_MASK; +} + #endif diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 9d8fcde52027..f2f4e10ab772 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -159,6 +159,11 @@ static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) irq->msi_redir_hint); } +static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); +} + bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7aec25f2f45c..aa46ac1ff48b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3101,6 +3101,11 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) return 0; } +static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) +{ + return 0; +} + static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -3206,8 +3211,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->sipi_vector = 0; /* never valid when reporting to user space */ + events->smi.smm = is_smm(vcpu); + events->smi.pending = vcpu->arch.smi_pending; + events->smi.smm_inside_nmi = + !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); + events->smi.latched_init = kvm_lapic_latched_init(vcpu); + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SHADOW); + | KVM_VCPUEVENT_VALID_SHADOW + | KVM_VCPUEVENT_VALID_SMM); memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -3216,7 +3228,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, { if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR - | KVM_VCPUEVENT_VALID_SHADOW)) + | KVM_VCPUEVENT_VALID_SHADOW + | KVM_VCPUEVENT_VALID_SMM)) return -EINVAL; process_nmi(vcpu); @@ -3241,6 +3254,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_vcpu_has_lapic(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; + if (events->flags & KVM_VCPUEVENT_VALID_SMM) { + if (events->smi.smm) + vcpu->arch.hflags |= HF_SMM_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_MASK; + vcpu->arch.smi_pending = events->smi.pending; + if (events->smi.smm_inside_nmi) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; + if (kvm_vcpu_has_lapic(vcpu)) { + if (events->smi.latched_init) + set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + else + clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + } + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -3500,6 +3531,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_nmi(vcpu); break; } + case KVM_SMI: { + r = kvm_vcpu_ioctl_smi(vcpu); + break; + } case KVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; @@ -6182,6 +6217,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); if (irqchip_in_kernel(vcpu->kvm)) -- cgit v1.2.1 From 64d6067057d9658acb8675afcfba549abdb7fc16 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 7 May 2015 11:36:11 +0200 Subject: KVM: x86: stubs for SMM support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds the interface between x86.c and the emulator: the SMBASE register, a new emulator flag, the RSM instruction. It also adds a new request bit that will be used by the KVM_SMI ioctl. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 4 +++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/emulate.c | 10 +++++- arch/x86/kvm/lapic.c | 4 ++- arch/x86/kvm/svm.c | 1 + arch/x86/kvm/x86.c | 64 ++++++++++++++++++++++++++++++++++++-- 6 files changed, 80 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7410879a41f7..e16466ec473c 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -193,6 +193,8 @@ struct x86_emulate_ops { int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); + u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt); + void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); @@ -264,6 +266,8 @@ enum x86emul_mode { /* These match some of the HF_* flags defined in kvm_host.h */ #define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ +#define X86EMUL_SMM_MASK (1 << 6) +#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7) struct x86_emulate_ctxt { const struct x86_emulate_ops *ops; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d52d7aea375f..12a7318887ad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -368,6 +368,7 @@ struct kvm_vcpu_arch { int32_t apic_arb_prio; int mp_state; u64 ia32_misc_enable_msr; + u64 smbase; bool tpr_access_reporting; u64 ia32_xss; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a1c6c25552e9..e763a9b8c26b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2259,6 +2259,14 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) return rc; } +static int em_rsm(struct x86_emulate_ctxt *ctxt) +{ + if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) + return emulate_ud(ctxt); + + return X86EMUL_UNHANDLEABLE; +} + static void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct desc_struct *cs, struct desc_struct *ss) @@ -4197,7 +4205,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - DI(ImplicitOps, rsm), + II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm), F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c789e00dfa8b..b8e47e2b1d94 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -808,7 +808,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_SMI: - apic_debug("Ignoring guest SMI\n"); + result = 1; + kvm_make_request(KVM_REQ_SMI, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_NMI: diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a08df4145173..c48748cf638e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3394,6 +3394,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, + [SVM_EXIT_RSM] = emulate_on_interception, }; static void dump_vmcb(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aa46ac1ff48b..ab977e763812 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -954,6 +954,7 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, + MSR_IA32_SMBASE, }; static unsigned num_emulated_msrs; @@ -2282,6 +2283,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; + case MSR_IA32_SMBASE: + if (!msr_info->host_initiated) + return 1; + vcpu->arch.smbase = data; + break; case MSR_KVM_WALL_CLOCK_NEW: case MSR_KVM_WALL_CLOCK: vcpu->kvm->arch.wall_clock = data; @@ -2679,6 +2685,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MISC_ENABLE: msr_info->data = vcpu->arch.ia32_misc_enable_msr; break; + case MSR_IA32_SMBASE: + if (!msr_info->host_initiated) + return 1; + msr_info->data = vcpu->arch.smbase; + break; case MSR_IA32_PERF_STATUS: /* TSC increment by tick */ msr_info->data = 1000ULL; @@ -3103,6 +3114,8 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_SMI, vcpu); + return 0; } @@ -5129,6 +5142,20 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } +static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + return vcpu->arch.smbase; +} + +static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + vcpu->arch.smbase = smbase; +} + static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { @@ -5214,6 +5241,8 @@ static const struct x86_emulate_ops emulate_ops = { .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, + .get_smbase = emulator_get_smbase, + .set_smbase = emulator_set_smbase, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .check_pmc = emulator_check_pmc, @@ -5276,6 +5305,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); + BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); + BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); @@ -5445,9 +5476,24 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); -void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) +static void kvm_smm_changed(struct kvm_vcpu *vcpu) { + if (!(vcpu->arch.hflags & HF_SMM_MASK)) { + if (unlikely(vcpu->arch.smi_pending)) { + kvm_make_request(KVM_REQ_SMI, vcpu); + vcpu->arch.smi_pending = 0; + } + } +} + +static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) +{ + unsigned changed = vcpu->arch.hflags ^ emul_flags; + vcpu->arch.hflags = emul_flags; + + if (changed & HF_SMM_MASK) + kvm_smm_changed(vcpu); } static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, @@ -6341,6 +6387,16 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +static void process_smi(struct kvm_vcpu *vcpu) +{ + if (is_smm(vcpu)) { + vcpu->arch.smi_pending = true; + return; + } + + printk_once(KERN_DEBUG "Ignoring guest SMI\n"); +} + static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; @@ -6449,6 +6505,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); + if (kvm_check_request(KVM_REQ_SMI, vcpu)) + process_smi(vcpu); if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) @@ -7363,8 +7421,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (!init_event) + if (!init_event) { kvm_pmu_reset(vcpu); + vcpu->arch.smbase = 0x30000; + } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); vcpu->arch.regs_avail = ~0; -- cgit v1.2.1 From cd7764fe9f73530b20a0f2310fa753af635fabb3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jun 2015 10:41:21 +0200 Subject: KVM: x86: latch INITs while in system management mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not process INITs immediately while in system management mode, keep it instead in apic->pending_events. Tell userspace if an INIT is pending when they issue GET_VCPU_EVENTS, and similarly handle the new field in SET_VCPU_EVENTS. Note that the same treatment should be done while in VMX non-root mode. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 13 ++++++++++++- arch/x86/kvm/x86.c | 3 +++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b8e47e2b1d94..beeef05bb4d9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2057,8 +2057,19 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events) return; - pe = xchg(&apic->pending_events, 0); + /* + * INITs are latched while in SMM. Because an SMM CPU cannot + * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs + * and delay processing of INIT until the next RSM. + */ + if (is_smm(vcpu)) { + WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); + if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) + clear_bit(KVM_APIC_SIPI, &apic->pending_events); + return; + } + pe = xchg(&apic->pending_events, 0); if (test_bit(KVM_APIC_INIT, &pe)) { kvm_lapic_reset(vcpu, true); kvm_vcpu_reset(vcpu, true); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ab977e763812..ab2521b588d8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5482,6 +5482,9 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu) if (unlikely(vcpu->arch.smi_pending)) { kvm_make_request(KVM_REQ_SMI, vcpu); vcpu->arch.smi_pending = 0; + } else { + /* Process a latched INIT, if any. */ + kvm_make_request(KVM_REQ_EVENT, vcpu); } } } -- cgit v1.2.1 From b44a2b53becf2485f484bd6bb6c1d963ebc339f8 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 4 Jun 2015 16:31:47 +0300 Subject: perf/x86/intel/pt: Fix a refactoring bug Commit 066450be41 ("perf/x86/intel/pt: Clean up the control flow in pt_pmu_hw_init()") changed attribute initialization so that only the first attribute gets initialized using sysfs_attr_init(), which upsets lockdep. This patch fixes the glitch so that all allocated attributes are properly initialized thus fixing the lockdep warning reported by Tvrtko and Imre. Reported-by: Tvrtko Ursulin Reported-by: Imre Deak Signed-off-by: Alexander Shishkin Cc: Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 5b804f96ad66..123ff1bb2f60 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -151,7 +151,7 @@ static int __init pt_pmu_hw_init(void) de_attr->attr.attr.name = pt_caps[i].name; - sysfs_attr_init(&de_attrs->attr.attr); + sysfs_attr_init(&de_attr->attr.attr); de_attr->attr.attr.mode = S_IRUGO; de_attr->attr.show = pt_cap_show; -- cgit v1.2.1 From 660a5d517aaab9187f93854425c4c63f4a09195c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 5 May 2015 11:50:23 +0200 Subject: KVM: x86: save/load state on SMM switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The big ugly one. This patch adds support for switching in and out of system management mode, respectively upon receiving KVM_REQ_SMI and upon executing a RSM instruction. Both 32- and 64-bit formats are supported for the SMM state save area. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 8 ++ arch/x86/kvm/emulate.c | 248 ++++++++++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/trace.h | 22 +++++ arch/x86/kvm/x86.c | 222 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 498 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 496b3695d3d3..dd05b9cef6ae 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_LM)); +} + static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e763a9b8c26b..e7a4fde5d631 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2259,12 +2259,258 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) return rc; } +static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ebx, ecx, edx; + + eax = 0x80000001; + ecx = 0; + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + return edx & bit(X86_FEATURE_LM); +} + +#define GET_SMSTATE(type, smbase, offset) \ + ({ \ + type __val; \ + int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \ + sizeof(__val), NULL); \ + if (r != X86EMUL_CONTINUE) \ + return X86EMUL_UNHANDLEABLE; \ + __val; \ + }) + +static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) +{ + desc->g = (flags >> 23) & 1; + desc->d = (flags >> 22) & 1; + desc->l = (flags >> 21) & 1; + desc->avl = (flags >> 20) & 1; + desc->p = (flags >> 15) & 1; + desc->dpl = (flags >> 13) & 3; + desc->s = (flags >> 12) & 1; + desc->type = (flags >> 8) & 15; +} + +static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +{ + struct desc_struct desc; + int offset; + u16 selector; + + selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4); + + if (n < 3) + offset = 0x7f84 + n * 12; + else + offset = 0x7f2c + (n - 3) * 12; + + set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); + return X86EMUL_CONTINUE; +} + +static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +{ + struct desc_struct desc; + int offset; + u16 selector; + u32 base3; + + offset = 0x7e00 + n * 16; + + selector = GET_SMSTATE(u16, smbase, offset); + rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); + base3 = GET_SMSTATE(u32, smbase, offset + 12); + + ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); + return X86EMUL_CONTINUE; +} + +static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, + u64 cr0, u64 cr4) +{ + int bad; + + /* + * First enable PAE, long mode needs it before CR0.PG = 1 is set. + * Then enable protected mode. However, PCID cannot be enabled + * if EFER.LMA=0, so set it separately. + */ + bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + if (bad) + return X86EMUL_UNHANDLEABLE; + + bad = ctxt->ops->set_cr(ctxt, 0, cr0); + if (bad) + return X86EMUL_UNHANDLEABLE; + + if (cr4 & X86_CR4_PCIDE) { + bad = ctxt->ops->set_cr(ctxt, 4, cr4); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + + return X86EMUL_CONTINUE; +} + +static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct desc_struct desc; + struct desc_ptr dt; + u16 selector; + u32 val, cr0, cr4; + int i; + + cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); + ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); + ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); + + for (i = 0; i < 8; i++) + *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4); + + val = GET_SMSTATE(u32, smbase, 0x7fcc); + ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + val = GET_SMSTATE(u32, smbase, 0x7fc8); + ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + selector = GET_SMSTATE(u32, smbase, 0x7fc4); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); + + selector = GET_SMSTATE(u32, smbase, 0x7fc0); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); + + dt.address = GET_SMSTATE(u32, smbase, 0x7f74); + dt.size = GET_SMSTATE(u32, smbase, 0x7f70); + ctxt->ops->set_gdt(ctxt, &dt); + + dt.address = GET_SMSTATE(u32, smbase, 0x7f58); + dt.size = GET_SMSTATE(u32, smbase, 0x7f54); + ctxt->ops->set_idt(ctxt, &dt); + + for (i = 0; i < 6; i++) { + int r = rsm_load_seg_32(ctxt, smbase, i); + if (r != X86EMUL_CONTINUE) + return r; + } + + cr4 = GET_SMSTATE(u32, smbase, 0x7f14); + + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); + + return rsm_enter_protected_mode(ctxt, cr0, cr4); +} + +static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct desc_struct desc; + struct desc_ptr dt; + u64 val, cr0, cr4; + u32 base3; + u16 selector; + int i; + + for (i = 0; i < 16; i++) + *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8); + + ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78); + ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED; + + val = GET_SMSTATE(u32, smbase, 0x7f68); + ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + val = GET_SMSTATE(u32, smbase, 0x7f60); + ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + cr0 = GET_SMSTATE(u64, smbase, 0x7f58); + ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); + cr4 = GET_SMSTATE(u64, smbase, 0x7f48); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); + val = GET_SMSTATE(u64, smbase, 0x7ed0); + ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); + + selector = GET_SMSTATE(u32, smbase, 0x7e90); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98)); + base3 = GET_SMSTATE(u32, smbase, 0x7e9c); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); + + dt.size = GET_SMSTATE(u32, smbase, 0x7e84); + dt.address = GET_SMSTATE(u64, smbase, 0x7e88); + ctxt->ops->set_idt(ctxt, &dt); + + selector = GET_SMSTATE(u32, smbase, 0x7e70); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78)); + base3 = GET_SMSTATE(u32, smbase, 0x7e7c); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); + + dt.size = GET_SMSTATE(u32, smbase, 0x7e64); + dt.address = GET_SMSTATE(u64, smbase, 0x7e68); + ctxt->ops->set_gdt(ctxt, &dt); + + for (i = 0; i < 6; i++) { + int r = rsm_load_seg_64(ctxt, smbase, i); + if (r != X86EMUL_CONTINUE) + return r; + } + + return rsm_enter_protected_mode(ctxt, cr0, cr4); +} + static int em_rsm(struct x86_emulate_ctxt *ctxt) { + unsigned long cr0, cr4, efer; + u64 smbase; + int ret; + if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); - return X86EMUL_UNHANDLEABLE; + /* + * Get back to real mode, to prepare a safe state in which to load + * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed + * to read_std/write_std are not virtual. + * + * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. + */ + cr0 = ctxt->ops->get_cr(ctxt, 0); + if (cr0 & X86_CR0_PE) + ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PAE) + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + efer = 0; + ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + + smbase = ctxt->ops->get_smbase(ctxt); + if (emulator_has_longmode(ctxt)) + ret = rsm_load_state_64(ctxt, smbase + 0x8000); + else + ret = rsm_load_state_32(ctxt, smbase + 0x8000); + + if (ret != X86EMUL_CONTINUE) { + /* FIXME: should triple fault */ + return X86EMUL_UNHANDLEABLE; + } + + if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + ctxt->ops->set_nmi_mask(ctxt, false); + + ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK; + ctxt->emul_flags &= ~X86EMUL_SMM_MASK; + return X86EMUL_CONTINUE; } static void diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 7c7bc8bef21f..4eae7c35ddf5 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -952,6 +952,28 @@ TRACE_EVENT(kvm_wait_lapic_expire, __entry->delta < 0 ? "early" : "late") ); +TRACE_EVENT(kvm_enter_smm, + TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering), + TP_ARGS(vcpu_id, smbase, entering), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( u64, smbase ) + __field( bool, entering ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->smbase = smbase; + __entry->entering = entering; + ), + + TP_printk("vcpu %u: %s SMM, smbase 0x%llx", + __entry->vcpu_id, + __entry->entering ? "entering" : "leaving", + __entry->smbase) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ab2521b588d8..51d994e1d6af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5479,6 +5479,9 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu); static void kvm_smm_changed(struct kvm_vcpu *vcpu) { if (!(vcpu->arch.hflags & HF_SMM_MASK)) { + /* This is a good place to trace that we are exiting SMM. */ + trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); + if (unlikely(vcpu->arch.smi_pending)) { kvm_make_request(KVM_REQ_SMI, vcpu); vcpu->arch.smi_pending = 0; @@ -6390,14 +6393,231 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +#define put_smstate(type, buf, offset, val) \ + *(type *)((buf) + (offset) - 0x7e00) = val + +static u32 process_smi_get_segment_flags(struct kvm_segment *seg) +{ + u32 flags = 0; + flags |= seg->g << 23; + flags |= seg->db << 22; + flags |= seg->l << 21; + flags |= seg->avl << 20; + flags |= seg->present << 15; + flags |= seg->dpl << 13; + flags |= seg->s << 12; + flags |= seg->type << 8; + return flags; +} + +static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) +{ + struct kvm_segment seg; + int offset; + + kvm_get_segment(vcpu, &seg, n); + put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector); + + if (n < 3) + offset = 0x7f84 + n * 12; + else + offset = 0x7f2c + (n - 3) * 12; + + put_smstate(u32, buf, offset + 8, seg.base); + put_smstate(u32, buf, offset + 4, seg.limit); + put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg)); +} + +static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) +{ + struct kvm_segment seg; + int offset; + u16 flags; + + kvm_get_segment(vcpu, &seg, n); + offset = 0x7e00 + n * 16; + + flags = process_smi_get_segment_flags(&seg) >> 8; + put_smstate(u16, buf, offset, seg.selector); + put_smstate(u16, buf, offset + 2, flags); + put_smstate(u32, buf, offset + 4, seg.limit); + put_smstate(u64, buf, offset + 8, seg.base); +} + +static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) +{ + struct desc_ptr dt; + struct kvm_segment seg; + unsigned long val; + int i; + + put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); + put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); + put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); + put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); + + for (i = 0; i < 8; i++) + put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i)); + + kvm_get_dr(vcpu, 6, &val); + put_smstate(u32, buf, 0x7fcc, (u32)val); + kvm_get_dr(vcpu, 7, &val); + put_smstate(u32, buf, 0x7fc8, (u32)val); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); + put_smstate(u32, buf, 0x7fc4, seg.selector); + put_smstate(u32, buf, 0x7f64, seg.base); + put_smstate(u32, buf, 0x7f60, seg.limit); + put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg)); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); + put_smstate(u32, buf, 0x7fc0, seg.selector); + put_smstate(u32, buf, 0x7f80, seg.base); + put_smstate(u32, buf, 0x7f7c, seg.limit); + put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg)); + + kvm_x86_ops->get_gdt(vcpu, &dt); + put_smstate(u32, buf, 0x7f74, dt.address); + put_smstate(u32, buf, 0x7f70, dt.size); + + kvm_x86_ops->get_idt(vcpu, &dt); + put_smstate(u32, buf, 0x7f58, dt.address); + put_smstate(u32, buf, 0x7f54, dt.size); + + for (i = 0; i < 6; i++) + process_smi_save_seg_32(vcpu, buf, i); + + put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); + + /* revision id */ + put_smstate(u32, buf, 0x7efc, 0x00020000); + put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); +} + +static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) +{ +#ifdef CONFIG_X86_64 + struct desc_ptr dt; + struct kvm_segment seg; + unsigned long val; + int i; + + for (i = 0; i < 16; i++) + put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i)); + + put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu)); + put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); + + kvm_get_dr(vcpu, 6, &val); + put_smstate(u64, buf, 0x7f68, val); + kvm_get_dr(vcpu, 7, &val); + put_smstate(u64, buf, 0x7f60, val); + + put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); + put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); + put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); + + put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase); + + /* revision id */ + put_smstate(u32, buf, 0x7efc, 0x00020064); + + put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); + put_smstate(u16, buf, 0x7e90, seg.selector); + put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u32, buf, 0x7e94, seg.limit); + put_smstate(u64, buf, 0x7e98, seg.base); + + kvm_x86_ops->get_idt(vcpu, &dt); + put_smstate(u32, buf, 0x7e84, dt.size); + put_smstate(u64, buf, 0x7e88, dt.address); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); + put_smstate(u16, buf, 0x7e70, seg.selector); + put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u32, buf, 0x7e74, seg.limit); + put_smstate(u64, buf, 0x7e78, seg.base); + + kvm_x86_ops->get_gdt(vcpu, &dt); + put_smstate(u32, buf, 0x7e64, dt.size); + put_smstate(u64, buf, 0x7e68, dt.address); + + for (i = 0; i < 6; i++) + process_smi_save_seg_64(vcpu, buf, i); +#else + WARN_ON_ONCE(1); +#endif +} + static void process_smi(struct kvm_vcpu *vcpu) { + struct kvm_segment cs, ds; + char buf[512]; + u32 cr0; + if (is_smm(vcpu)) { vcpu->arch.smi_pending = true; return; } - printk_once(KERN_DEBUG "Ignoring guest SMI\n"); + trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); + vcpu->arch.hflags |= HF_SMM_MASK; + memset(buf, 0, 512); + if (guest_cpuid_has_longmode(vcpu)) + process_smi_save_state_64(vcpu, buf); + else + process_smi_save_state_32(vcpu, buf); + + kvm_write_guest(vcpu->kvm, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); + + if (kvm_x86_ops->get_nmi_mask(vcpu)) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; + else + kvm_x86_ops->set_nmi_mask(vcpu, true); + + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0x8000); + + cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); + kvm_x86_ops->set_cr0(vcpu, cr0); + vcpu->arch.cr0 = cr0; + + kvm_x86_ops->set_cr4(vcpu, 0); + + __kvm_set_dr(vcpu, 7, DR7_FIXED_1); + + cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; + cs.base = vcpu->arch.smbase; + + ds.selector = 0; + ds.base = 0; + + cs.limit = ds.limit = 0xffffffff; + cs.type = ds.type = 0x3; + cs.dpl = ds.dpl = 0; + cs.db = ds.db = 0; + cs.s = ds.s = 1; + cs.l = ds.l = 0; + cs.g = ds.g = 1; + cs.avl = ds.avl = 0; + cs.present = ds.present = 1; + cs.unusable = ds.unusable = 0; + cs.padding = ds.padding = 0; + + kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); + kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); + + if (guest_cpuid_has_longmode(vcpu)) + kvm_x86_ops->set_efer(vcpu, 0); + + kvm_update_cpuid(vcpu); + kvm_mmu_reset_context(vcpu); } static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) -- cgit v1.2.1 From cf991de2f614f454b3cb2a300c06ecdf69f3a70d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 4 Jun 2015 17:13:44 -0700 Subject: x86/asm/msr: Make wrmsrl_safe() a function The wrmsrl_safe macro performs invalid shifts if the value argument is 32 bits. This makes it unnecessarily awkward to write code that puts an unsigned long into an MSR. Convert it to a real inline function. For inspiration, see: 7c74d5b7b7b6 ("x86/asm/entry/64: Fix MSR_IA32_SYSENTER_CS MSR value"). Signed-off-by: Andy Lutomirski Cc: Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner [ Applied small improvements. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index de36f22eb0b9..13bf576c401d 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -205,8 +205,13 @@ do { \ #endif /* !CONFIG_PARAVIRT */ -#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \ - (u32)((val) >> 32)) +/* + * 64-bit version of wrmsr_safe(): + */ +static inline int wrmsrl_safe(u32 msr, u64 val) +{ + return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); +} #define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high)) -- cgit v1.2.1 From 5ca6f70f387b4f82903037cc3c5488e2c97dcdbc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 4 Jun 2015 13:24:29 -0700 Subject: x86/asm/entry/64: Remove pointless jump to irq_return INTERRUPT_RETURN turns into a jmp instruction. There's no need for extra indirection. Signed-off-by: Andy Lutomirski Cc: Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2f2318653dbad284a59311f13f08cea71298fd7c.1433449436.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f7380ea75777..31770662b940 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -663,8 +663,6 @@ retint_kernel: restore_c_regs_and_iret: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 - -irq_return: INTERRUPT_RETURN ENTRY(native_iret) @@ -1432,7 +1430,7 @@ nmi_restore: /* Clear the NMI executing stack variable */ movq $0, 5*8(%rsp) - jmp irq_return + INTERRUPT_RETURN END(nmi) ENTRY(ignore_sysret) -- cgit v1.2.1 From 61b1e3e782d6784b714c0d80de529e0737d0e79c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 2 Jun 2015 19:35:10 +0200 Subject: x86/asm/entry/32: Simplify the zeroing of pt_regs->r8..r11 in the int80 code path 32-bit syscall entry points do not save the complete pt_regs struct, they leave some fields uninitialized. However, they must be careful to not leak uninitialized data in pt_regs->r8..r11 to ptrace users. CLEAR_RREGS macro is used to zero these fields out when needed. However, in the int80 code path this zeroing is unconditional. This patch simplifies it by storing zeroes there right away, when pt_regs is constructed on stack. This uses shorter instructions: text data bss dec hex filename 1423 0 0 1423 58f ia32entry.o.before 1407 0 0 1407 57f ia32entry.o Compile-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433266510-2938-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index f16767417385..f00a409dc403 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -421,6 +421,10 @@ ia32_badarg: movq $-EFAULT,%rax jmp ia32_sysret +ia32_ret_from_sys_call: + CLEAR_RREGS + jmp int_ret_from_sys_call + /* * Emulated IA32 system calls via int 0x80. * @@ -462,8 +466,12 @@ ENTRY(ia32_syscall) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ + pushq $0 /* pt_regs->r8 */ + pushq $0 /* pt_regs->r9 */ + pushq $0 /* pt_regs->r10 */ + pushq $0 /* pt_regs->r11 */ cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -481,13 +489,10 @@ ia32_do_call: ia32_sysret: movq %rax,RAX(%rsp) 1: -ia32_ret_from_sys_call: - CLEAR_RREGS jmp int_ret_from_sys_call ia32_tracesys: SAVE_EXTRA_REGS - CLEAR_RREGS movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ -- cgit v1.2.1 From ef0cd5dc25404594f832dad9133abae52e3b2fa3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 2 Jun 2015 21:04:01 +0200 Subject: x86/asm/entry/32: Open-code CLEAR_RREGS This macro is small, has only four callsites, and one of them is slightly different using a conditional parameter. A few saved lines aren't worth the resulting obfuscation. Generated machine code is identical. Signed-off-by: Denys Vlasenko [ Added comments. ] Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433271842-9139-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index f00a409dc403..8a45d2cd2bef 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -29,15 +29,6 @@ .section .entry.text, "ax" - /* clobbers %rax */ - .macro CLEAR_RREGS _r9=rax - xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %\_r9,R9(%rsp) - movq %rax,R8(%rsp) - .endm - /* * Reload arg registers from stack in case ptrace changed them. * We don't reload %eax because syscall_trace_enter() returned @@ -243,7 +234,11 @@ sysexit_from_sys_call: TRACE_IRQS_OFF testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz \exit - CLEAR_RREGS + xorl %eax, %eax /* do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) jmp int_with_check .endm @@ -267,7 +262,11 @@ sysenter_tracesys: jz sysenter_auditsys #endif SAVE_EXTRA_REGS - CLEAR_RREGS + xorl %eax, %eax /* do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ @@ -407,7 +406,11 @@ cstar_tracesys: #endif xchgl %r9d,%ebp SAVE_EXTRA_REGS - CLEAR_RREGS r9 + xorl %eax, %eax /* do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %r9, R9(%rsp) + movq %rax, R8(%rsp) movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ @@ -422,7 +425,11 @@ ia32_badarg: jmp ia32_sysret ia32_ret_from_sys_call: - CLEAR_RREGS + xorl %eax, %eax /* do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) jmp int_ret_from_sys_call /* -- cgit v1.2.1 From 73cbf687914fd5f4ef88a42a55784fd28b7450cf Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 2 Jun 2015 21:04:02 +0200 Subject: x86/asm/entry/32: Open-code LOAD_ARGS32 This macro is small, has only three callsites, and one of them is slightly different using a conditional parameter. A few saved lines aren't worth the resulting obfuscation. Generated machine code is identical. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433271842-9139-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 54 +++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index 8a45d2cd2bef..56f819e99258 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -29,28 +29,6 @@ .section .entry.text, "ax" - /* - * Reload arg registers from stack in case ptrace changed them. - * We don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. Instead, we just truncate that - * value to 32 bits again as we did on entry from user mode. - * If it's a new value set by user_regset during entry tracing, - * this matches the normal truncation of the user-mode value. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. - */ - .macro LOAD_ARGS32 _r9=0 - .if \_r9 - movl R9(%rsp),%r9d - .endif - movl RCX(%rsp),%ecx - movl RDX(%rsp),%edx - movl RSI(%rsp),%esi - movl RDI(%rsp),%edi - movl %eax,%eax /* zero extension */ - .endm - - #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret32) swapgs @@ -269,7 +247,14 @@ sysenter_tracesys: movq %rax, R8(%rsp) movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + + /* Reload arg registers from stack. (see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + RESTORE_EXTRA_REGS jmp sysenter_do_call ENDPROC(ia32_sysenter_target) @@ -413,7 +398,15 @@ cstar_tracesys: movq %rax, R8(%rsp) movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ + movl R9(%rsp),%r9d + + /* Reload arg registers from stack. (see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + RESTORE_EXTRA_REGS xchgl %ebp,%r9d jmp cstar_do_call @@ -502,7 +495,18 @@ ia32_tracesys: SAVE_EXTRA_REGS movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + /* + * Reload arg registers from stack in case ptrace changed them. + * Don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. But do truncate it to 32 bits. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS jmp ia32_do_call END(ia32_syscall) -- cgit v1.2.1 From 53e9accf0f7682d717c7b578b6e01fd297ba6630 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 3 Jun 2015 14:56:09 +0200 Subject: x86/asm/entry/32: Do not use R9 in SYSCALL32 entry point SYSENTER and SYSCALL 32-bit entry points differ in handling of arg2 and arg6. SYSENTER: * ecx arg2 * ebp user stack * 0(%ebp) arg6 SYSCALL: * ebp arg2 * esp user stack * 0(%esp) arg6 Sysenter code loads 0(%ebp) to %ebp right away. (This destroys %ebp. It means we do not preserve it on return. It's not causing problems since userspace VDSO code does not depend on it, and SYSENTER insn can't be sanely used outside of VDSO). Syscall code loads 0(%ebp) to %r9. This allows to eliminate one MOV insn (r9 is a register where arg6 should be for 64-bit ABI), but on audit/ptrace code paths this requires juggling of r9 and ebp: (1) ptrace expects arg6 to be in pt_regs->bp; (2) r9 is callee-clobbered register and needs to be saved/restored around calls to C functions. This patch changes syscall code to load 0(%ebp) to %ebp, making it more similar to sysenter code. It's a bit smaller: text data bss dec hex filename 1407 0 0 1407 57f ia32entry.o.before 1391 0 0 1391 56f ia32entry.o To preserve ABI compat, we restore ebp on exit. Run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433336169-18964-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index 56f819e99258..63219154087e 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -323,7 +323,7 @@ ENTRY(ia32_cstar_target) * 32bit zero extended */ ASM_STAC -1: movl (%r8),%r9d +1: movl (%r8),%ebp _ASM_EXTABLE(1b,ia32_badarg) ASM_CLAC orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) @@ -333,7 +333,7 @@ ENTRY(ia32_cstar_target) cstar_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ - /* r9 already loaded */ /* arg6 */ + movl %ebp,%r9d /* arg6 */ xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ movl %ebx,%edi /* arg1 */ movl %edx,%edx /* arg3 (zero extension) */ @@ -349,6 +349,7 @@ cstar_dispatch: jnz sysretl_audit sysretl_from_sys_call: andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RCX(%rsp), %ebp RESTORE_RSI_RDI_RDX movl RIP(%rsp),%ecx movl EFLAGS(%rsp),%r11d @@ -375,9 +376,8 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: - movl %r9d,R9(%rsp) /* register to be clobbered by call */ auditsys_entry_common - movl R9(%rsp),%r9d /* reload 6th syscall arg */ + movl %ebp, %r9d /* reload 6th syscall arg */ jmp cstar_dispatch sysretl_audit: @@ -389,16 +389,14 @@ cstar_tracesys: testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz cstar_auditsys #endif - xchgl %r9d,%ebp SAVE_EXTRA_REGS xorl %eax, %eax /* do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) - movq %r9, R9(%rsp) + movq %rax, R9(%rsp) movq %rax, R8(%rsp) - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - movl R9(%rsp),%r9d + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter /* Reload arg registers from stack. (see sysenter_tracesys) */ movl RCX(%rsp), %ecx @@ -408,8 +406,7 @@ cstar_tracesys: movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS - xchgl %ebp,%r9d - jmp cstar_do_call + jmp cstar_do_call END(ia32_cstar_target) ia32_badarg: -- cgit v1.2.1 From 54ad726c51b7f7dffcc1dc379bedadee19f742f7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2015 13:02:28 +0200 Subject: x86/asm/entry/32: Improve code readability Make the 64-bit compat 32-bit syscall entry code a bit more readable: - eliminate whitespace noise - use consistent vertical spacing - use consistent assembly coding style similar to entry_64.S - fix various comments No code changed: arch/x86/entry/ia32entry.o: text data bss dec hex filename 1391 0 0 1391 56f ia32entry.o.before 1391 0 0 1391 56f ia32entry.o.after md5: f28501dcc366e68b557313942c6496d6 ia32entry.o.before.asm f28501dcc366e68b557313942c6496d6 ia32entry.o.after.asm Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 287 +++++++++++++++++++++++---------------------- 1 file changed, 146 insertions(+), 141 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index 63219154087e..4bb9f7b2bc9c 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -1,15 +1,14 @@ /* - * Compatibility mode system call entry point for x86-64. - * + * Compatibility mode system call entry point for x86-64. + * * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ - + */ #include "calling.h" #include #include #include -#include -#include +#include +#include #include #include #include @@ -20,11 +19,11 @@ /* Avoid __ASSEMBLER__'ifying just for this. */ #include #define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 +#define __AUDIT_ARCH_LE 0x40000000 #ifndef CONFIG_AUDITSYSCALL -#define sysexit_audit ia32_ret_from_sys_call -#define sysretl_audit ia32_ret_from_sys_call +# define sysexit_audit ia32_ret_from_sys_call +# define sysretl_audit ia32_ret_from_sys_call #endif .section .entry.text, "ax" @@ -37,7 +36,7 @@ ENDPROC(native_usergs_sysret32) #endif /* - * 32bit SYSENTER instruction entry. + * 32-bit SYSENTER instruction entry. * * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. * IF and VM in rflags are cleared (IOW: interrupts are off). @@ -79,7 +78,7 @@ ENTRY(ia32_sysenter_target) pushq %rbp /* pt_regs->sp */ pushfq /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ - pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -87,15 +86,15 @@ ENTRY(ia32_sysenter_target) pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ /* * no need to do an access_ok check here because rbp has been - * 32bit zero extended + * 32-bit zero extended */ ASM_STAC -1: movl (%rbp),%ebp - _ASM_EXTABLE(1b,ia32_badarg) +1: movl (%rbp), %ebp + _ASM_EXTABLE(1b, ia32_badarg) ASM_CLAC /* @@ -103,26 +102,26 @@ ENTRY(ia32_sysenter_target) * ourselves. To save a few cycles, we can check whether * NT was set instead of doing an unconditional popfq. */ - testl $X86_EFLAGS_NT,EFLAGS(%rsp) - jnz sysenter_fix_flags + testl $X86_EFLAGS_NT, EFLAGS(%rsp) + jnz sysenter_fix_flags sysenter_flags_fixed: orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysenter_tracesys + jnz sysenter_tracesys sysenter_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ sysenter_dispatch: - cmpq $(IA32_NR_syscalls-1),%rax + cmpq $(IA32_NR_syscalls-1), %rax ja 1f - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -139,21 +138,21 @@ sysexit_from_sys_call: * This code path is still called 'sysexit' because it pairs * with 'sysenter' and it uses the SYSENTER calling convention. */ - andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp),%ecx /* User %eip */ + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp), %ecx /* User %eip */ RESTORE_RSI_RDI - xorl %edx,%edx /* avoid info leaks */ - xorq %r8,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - movl EFLAGS(%rsp),%r11d /* User eflags */ + xorl %edx, %edx /* Do not leak kernel information */ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + movl EFLAGS(%rsp), %r11d /* User eflags */ TRACE_IRQS_ON /* * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, * since it avoids a dicey window with interrupts enabled. */ - movl RSP(%rsp),%esp + movl RSP(%rsp), %esp /* * USERGS_SYSRET32 does: @@ -179,51 +178,51 @@ sysexit_from_sys_call: #ifdef CONFIG_AUDITSYSCALL .macro auditsys_entry_common - movl %esi,%r8d /* 5th arg: 4th syscall arg */ - movl %ecx,%r9d /*swap with edx*/ - movl %edx,%ecx /* 4th arg: 3rd syscall arg */ - movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ - movl %ebx,%esi /* 2nd arg: 1st syscall arg */ - movl %eax,%edi /* 1st arg: syscall number */ - call __audit_syscall_entry - movl ORIG_RAX(%rsp),%eax /* reload syscall number */ - movl %ebx,%edi /* reload 1st syscall arg */ - movl RCX(%rsp),%esi /* reload 2nd syscall arg */ - movl RDX(%rsp),%edx /* reload 3rd syscall arg */ - movl RSI(%rsp),%ecx /* reload 4th syscall arg */ - movl RDI(%rsp),%r8d /* reload 5th syscall arg */ + movl %esi, %r8d /* 5th arg: 4th syscall arg */ + movl %ecx, %r9d /* swap with edx */ + movl %edx, %ecx /* 4th arg: 3rd syscall arg */ + movl %r9d, %edx /* 3rd arg: 2nd syscall arg */ + movl %ebx, %esi /* 2nd arg: 1st syscall arg */ + movl %eax, %edi /* 1st arg: syscall number */ + call __audit_syscall_entry + movl ORIG_RAX(%rsp), %eax /* reload syscall number */ + movl %ebx, %edi /* reload 1st syscall arg */ + movl RCX(%rsp), %esi /* reload 2nd syscall arg */ + movl RDX(%rsp), %edx /* reload 3rd syscall arg */ + movl RSI(%rsp), %ecx /* reload 4th syscall arg */ + movl RDI(%rsp), %r8d /* reload 5th syscall arg */ .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax,%esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX(%rsp),%rax /* reload syscall return value */ - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + movl %eax, %esi /* second arg, syscall return value */ + cmpl $-MAX_ERRNO, %eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ + movzbl %al, %edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movq RAX(%rsp), %rax /* reload syscall return value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - xorl %eax, %eax /* do not leak kernel information */ + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz \exit + xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) movq %rax, R9(%rsp) movq %rax, R8(%rsp) - jmp int_with_check + jmp int_with_check .endm sysenter_auditsys: auditsys_entry_common - movl %ebp,%r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch + movl %ebp, %r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch sysexit_audit: auditsys_exit sysexit_from_sys_call @@ -232,7 +231,7 @@ sysexit_audit: sysenter_fix_flags: pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) popfq - jmp sysenter_flags_fixed + jmp sysenter_flags_fixed sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL @@ -240,12 +239,12 @@ sysenter_tracesys: jz sysenter_auditsys #endif SAVE_EXTRA_REGS - xorl %eax, %eax /* do not leak kernel information */ + xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) movq %rax, R9(%rsp) movq %rax, R8(%rsp) - movq %rsp,%rdi /* &pt_regs -> arg1 */ + movq %rsp, %rdi /* &pt_regs -> arg1 */ call syscall_trace_enter /* Reload arg registers from stack. (see sysenter_tracesys) */ @@ -253,23 +252,23 @@ sysenter_tracesys: movl RDX(%rsp), %edx movl RSI(%rsp), %esi movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ + movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS jmp sysenter_do_call ENDPROC(ia32_sysenter_target) /* - * 32bit SYSCALL instruction entry. + * 32-bit SYSCALL instruction entry. * - * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC * are not needed). SYSCALL does not save anything on the stack * and does not change rsp. * * Note: rflags saving+masking-with-MSR happens only in Long mode - * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). + * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). * Don't get confused: rflags saving+masking depends on Long Mode Active bit * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). @@ -296,12 +295,12 @@ ENTRY(ia32_cstar_target) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK - movl %esp,%r8d - movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp + movl %esp, %r8d + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax + movl %eax, %eax /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ @@ -314,55 +313,58 @@ ENTRY(ia32_cstar_target) pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ pushq %rbp /* pt_regs->cx */ - movl %ebp,%ecx + movl %ebp, %ecx pushq $-ENOSYS /* pt_regs->ax */ - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ /* - * no need to do an access_ok check here because r8 has been - * 32bit zero extended + * No need to do an access_ok check here because r8 has been + * 32-bit zero extended: */ ASM_STAC -1: movl (%r8),%ebp - _ASM_EXTABLE(1b,ia32_badarg) +1: movl (%r8), %ebp + _ASM_EXTABLE(1b, ia32_badarg) ASM_CLAC orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz cstar_tracesys cstar_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + cstar_dispatch: - cmpq $(IA32_NR_syscalls-1),%rax + cmpq $(IA32_NR_syscalls-1), %rax ja 1f - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysretl_audit + sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) movl RCX(%rsp), %ebp RESTORE_RSI_RDI_RDX - movl RIP(%rsp),%ecx - movl EFLAGS(%rsp),%r11d - xorq %r10,%r10 - xorq %r9,%r9 - xorq %r8,%r8 + movl RIP(%rsp), %ecx + movl EFLAGS(%rsp), %r11d + xorq %r10, %r10 + xorq %r9, %r9 + xorq %r8, %r8 TRACE_IRQS_ON - movl RSP(%rsp),%esp + movl RSP(%rsp), %esp /* - * 64bit->32bit SYSRET restores eip from ecx, + * 64-bit->32-bit SYSRET restores eip from ecx, * eflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. - * (Note: 32bit->32bit SYSRET is different: since r11 + * (Note: 32-bit->32-bit SYSRET is different: since r11 * does not exist, it merely sets eflags.IF=1). * * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss @@ -377,8 +379,8 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ - jmp cstar_dispatch + movl %ebp, %r9d /* reload 6th syscall arg */ + jmp cstar_dispatch sysretl_audit: auditsys_exit sysretl_from_sys_call @@ -386,16 +388,16 @@ sysretl_audit: cstar_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz cstar_auditsys #endif SAVE_EXTRA_REGS - xorl %eax, %eax /* do not leak kernel information */ + xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) movq %rax, R9(%rsp) movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ + movq %rsp, %rdi /* &pt_regs -> arg1 */ call syscall_trace_enter /* Reload arg registers from stack. (see sysenter_tracesys) */ @@ -403,24 +405,24 @@ cstar_tracesys: movl RDX(%rsp), %edx movl RSI(%rsp), %esi movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ + movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS jmp cstar_do_call END(ia32_cstar_target) - + ia32_badarg: ASM_CLAC - movq $-EFAULT,%rax - jmp ia32_sysret + movq $-EFAULT, %rax + jmp ia32_sysret ia32_ret_from_sys_call: - xorl %eax, %eax /* do not leak kernel information */ + xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) movq %rax, R9(%rsp) movq %rax, R8(%rsp) - jmp int_ret_from_sys_call + jmp int_ret_from_sys_call /* * Emulated IA32 system calls via int 0x80. @@ -454,7 +456,7 @@ ENTRY(ia32_syscall) ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax + movl %eax, %eax /* Construct struct pt_regs on stack (iret frame is already on stack) */ pushq %rax /* pt_regs->orig_ax */ @@ -468,30 +470,33 @@ ENTRY(ia32_syscall) pushq $0 /* pt_regs->r10 */ pushq $0 /* pt_regs->r11 */ cld - sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_tracesys - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys ia32_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ - cmpq $(IA32_NR_syscalls-1),%rax + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1), %rax ja 1f - call *ia32_sys_call_table(,%rax,8) # xxx: rip relative + + call *ia32_sys_call_table(, %rax, 8) /* RIP relative */ + ia32_sysret: - movq %rax,RAX(%rsp) + movq %rax, RAX(%rsp) 1: - jmp int_ret_from_sys_call + jmp int_ret_from_sys_call ia32_tracesys: SAVE_EXTRA_REGS - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter /* * Reload arg registers from stack in case ptrace changed them. * Don't reload %eax because syscall_trace_enter() returned @@ -503,33 +508,33 @@ ia32_tracesys: movl RDX(%rsp), %edx movl RSI(%rsp), %esi movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ + movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS - jmp ia32_do_call + jmp ia32_do_call END(ia32_syscall) .macro PTREGSCALL label, func ALIGN GLOBAL(\label) - leaq \func(%rip),%rax - jmp ia32_ptregs_common + leaq \func(%rip), %rax + jmp ia32_ptregs_common .endm - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork ALIGN GLOBAL(stub32_clone) - leaq sys_clone(%rip),%rax + leaq sys_clone(%rip), %rax mov %r8, %rcx - jmp ia32_ptregs_common + jmp ia32_ptregs_common ALIGN ia32_ptregs_common: SAVE_EXTRA_REGS 8 - call *%rax + call *%rax RESTORE_EXTRA_REGS 8 ret END(ia32_ptregs_common) -- cgit v1.2.1 From 5cdc683b7d8b3341a3d18e0c5498bc1e4f3fb990 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 3 Jun 2015 15:58:49 +0200 Subject: x86/asm/entry/32: Explain the stub32_clone logic The reason for copying of %r8 to %rcx is quite non-obvious. Add a comment which explains why it is done. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433339930-20880-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index 4bb9f7b2bc9c..d0c7b28d5670 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -528,6 +528,14 @@ GLOBAL(\label) ALIGN GLOBAL(stub32_clone) leaq sys_clone(%rip), %rax + /* + * 32-bit clone API is clone(..., int tls_val, int *child_tidptr). + * 64-bit clone API is clone(..., int *child_tidptr, int tls_val). + * Native 64-bit kernel's sys_clone() implements the latter. + * We need to swap args here. But since tls_val is in fact ignored + * by sys_clone(), we can get away with an assignment + * (arg4 = arg5) instead of a full swap: + */ mov %r8, %rcx jmp ia32_ptregs_common -- cgit v1.2.1 From 7a5a9824c18f93415944c997dc6bb8eecfddd2e7 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 3 Jun 2015 15:58:50 +0200 Subject: x86/asm/entry/32: Remove unnecessary optimization in stub32_clone Really swap arguments #4 and #5 in stub32_clone instead of "optimizing" it into a move. Yes, tls_val is currently unused. Yes, on some CPUs XCHG is a little bit more expensive than MOV. But a cycle or two on an expensive syscall like clone() is way below noise floor, and this optimization is simply not worth the obfuscation of logic. [ There's also ongoing work on the clone() ABI by Josh Triplett that will depend on this change later on. ] Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Josh Triplett Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433339930-20880-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index d0c7b28d5670..9558dacf32b9 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -529,14 +529,13 @@ GLOBAL(\label) GLOBAL(stub32_clone) leaq sys_clone(%rip), %rax /* - * 32-bit clone API is clone(..., int tls_val, int *child_tidptr). - * 64-bit clone API is clone(..., int *child_tidptr, int tls_val). - * Native 64-bit kernel's sys_clone() implements the latter. - * We need to swap args here. But since tls_val is in fact ignored - * by sys_clone(), we can get away with an assignment - * (arg4 = arg5) instead of a full swap: + * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). + * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). + * + * The native 64-bit kernel's sys_clone() implements the latter, + * so we need to swap arguments here before calling it: */ - mov %r8, %rcx + xchg %r8, %rcx jmp ia32_ptregs_common ALIGN -- cgit v1.2.1 From f2af7d25b4aa5b01203cc76e7530ea7fd18864a0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 4 Jun 2015 14:18:49 +0800 Subject: x86/boot/setup: Clean up the e820_reserve_setup_data() code Deobfuscate the 'found' logic, it can be replaced with a simple: if (!pa_data) return; and 'found' can be eliminated. Signed-off-by: Wei Yang Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1433398729-8314-1-git-send-email-weiyang@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d74ac33290ae..c3992c3a4378 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -461,19 +461,18 @@ static void __init e820_reserve_setup_data(void) { struct setup_data *data; u64 pa_data; - int found = 0; pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return; + while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN); - found = 1; pa_data = data->next; early_memunmap(data, sizeof(*data)); } - if (!found) - return; sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); memcpy(&e820_saved, &e820, sizeof(struct e820map)); -- cgit v1.2.1 From e4cd1da944ed9d2acd2e4ccabf61ec443735f6db Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 May 2015 15:11:46 +0200 Subject: KVM: x86: pass struct kvm_mmu_page to gfn_to_rmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is always available (with one exception in the auditing code), and with the same auditing exception the level was coming from sp->role.level. Later, the spte's role will also be used to look up the right memslots array. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 10 +++++----- arch/x86/kvm/mmu_audit.c | 8 ++++++-- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a65ce12470f8..0d01cbbcf3eb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1043,12 +1043,12 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, /* * Take gfn and return the reverse mapping to it. */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp) { struct kvm_memory_slot *slot; slot = gfn_to_memslot(kvm, gfn); - return __gfn_to_rmap(gfn, level, slot); + return __gfn_to_rmap(gfn, sp->role.level, slot); } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1066,7 +1066,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); return pte_list_add(vcpu, spte, rmapp); } @@ -1078,7 +1078,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) sp = page_header(__pa(spte)); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(kvm, gfn, sp); pte_list_remove(spte, rmapp); } @@ -1612,7 +1612,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 368d53497314..9d99f17aa3be 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -146,7 +146,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); + rmapp = gfn_to_rmap(kvm, gfn, rev_sp); if (!*rmapp) { if (!__ratelimit(&ratelimit_state)) return; @@ -191,11 +191,15 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) unsigned long *rmapp; u64 *sptep; struct rmap_iterator iter; + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; if (sp->role.direct || sp->unsync || sp->role.invalid) return; - rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL); + slots = kvm_memslots(kvm); + slot = __gfn_to_memslot(slots, sp->gfn); + rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); for_each_rmap_spte(rmapp, &iter, sptep) if (is_writable_pte(*sptep)) -- cgit v1.2.1 From 54bf36aac520315385fe7623a5c3a698e993ceda Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Apr 2015 15:39:23 +0200 Subject: KVM: x86: use vcpu-specific functions to read/write/translate GFNs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to hide SMRAM from guests not running in SMM. Therefore, all uses of kvm_read_guest* and kvm_write_guest* must be changed to check whether the VCPU is in system management mode and use a different set of memslots. Switch from kvm_* to the newly-introduced kvm_vcpu_*, which call into kvm_arch_vcpu_memslots_id. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 62 ++++++++++++++++++++--------------------- arch/x86/kvm/mmu_audit.c | 2 +- arch/x86/kvm/paging_tmpl.h | 18 ++++++------ arch/x86/kvm/svm.c | 12 ++++---- arch/x86/kvm/vmx.c | 32 ++++++++++----------- arch/x86/kvm/x86.c | 32 ++++++++++----------- 7 files changed, 80 insertions(+), 80 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 12a7318887ad..2fd420255c2f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -887,7 +887,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0d01cbbcf3eb..3814f483ac45 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -223,15 +223,15 @@ static unsigned int get_mmio_spte_generation(u64 spte) return gen; } -static unsigned int kvm_current_mmio_generation(struct kvm *kvm) +static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) { - return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; + return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; } -static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, +static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - unsigned int gen = kvm_current_mmio_generation(kvm); + unsigned int gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; @@ -258,22 +258,22 @@ static unsigned get_mmio_spte_access(u64 spte) return (spte & ~mask) & ~PAGE_MASK; } -static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(kvm, sptep, gfn, access); + mark_mmio_spte(vcpu, sptep, gfn, access); return true; } return false; } -static bool check_mmio_spte(struct kvm *kvm, u64 spte) +static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { unsigned int kvm_gen, spte_gen; - kvm_gen = kvm_current_mmio_generation(kvm); + kvm_gen = kvm_current_mmio_generation(vcpu); spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); @@ -837,14 +837,14 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm->arch.indirect_shadow_pages--; } -static int has_wrprotected_page(struct kvm *kvm, +static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = gfn_to_memslot(kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); return linfo->write_count; @@ -876,7 +876,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, { struct kvm_memory_slot *slot; - slot = gfn_to_memslot(vcpu->kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID || (no_dirty_log && slot->dirty_bitmap)) slot = NULL; @@ -901,7 +901,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) + if (has_wrprotected_page(vcpu, large_gfn, level)) break; return level - 1; @@ -1336,18 +1336,18 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static bool rmap_write_protect(struct kvm *kvm, u64 gfn) +static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; unsigned long *rmapp; int i; bool write_protected = false; - slot = gfn_to_memslot(kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, true); + write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true); } return write_protected; @@ -2032,7 +2032,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, bool protected = false; for_each_sp(pages, sp, parents, i) - protected |= rmap_write_protect(vcpu->kvm, sp->gfn); + protected |= rmap_write_protect(vcpu, sp->gfn); if (protected) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2130,7 +2130,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_add_head(&sp->hash_link, &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { - if (rmap_write_protect(vcpu->kvm, gfn)) + if (rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); if (level > PT_PAGE_TABLE_LEVEL && need_sync) kvm_sync_pages(vcpu, gfn); @@ -2581,7 +2581,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte; int ret = 0; - if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) + if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; spte = PT_PRESENT_MASK; @@ -2618,7 +2618,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * be fixed if guest refault. */ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu->kvm, gfn, level)) + has_wrprotected_page(vcpu, gfn, level)) goto done; spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; @@ -2642,7 +2642,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } if (pte_access & ACC_WRITE_MASK) { - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); spte |= shadow_dirty_mask; } @@ -2860,7 +2860,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) return 1; if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); + kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); return 0; } @@ -2883,7 +2883,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && - !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { + !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; /* * mmu_notifier_retry was successful and we hold the @@ -2975,7 +2975,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * Compare with set_spte where instead shadow_dirty_mask is set. */ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); return true; } @@ -3430,7 +3430,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); - if (!check_mmio_spte(vcpu->kvm, spte)) + if (!check_mmio_spte(vcpu, spte)) return RET_MMIO_PF_INVALID; if (direct) @@ -3502,7 +3502,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); + return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) @@ -3520,7 +3520,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, struct kvm_memory_slot *slot; bool async; - slot = gfn_to_memslot(vcpu->kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); async = false; *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); if (!async) @@ -3633,7 +3633,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, vcpu->arch.mmu.inject_page_fault(vcpu, fault); } -static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { @@ -3643,7 +3643,7 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, } (*nr_present)++; - mark_mmio_spte(kvm, sptep, gfn, access); + mark_mmio_spte(vcpu, sptep, gfn, access); return true; } @@ -4153,7 +4153,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); + r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8); if (r) gentry = 0; new = (const u8 *)&gentry; @@ -4779,13 +4779,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) { /* * The very rare case: if the generation-number is round, * zap all shadow pages. */ - if (unlikely(kvm_current_mmio_generation(kvm) == 0)) { + if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 9d99f17aa3be..78288c15400c 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -114,7 +114,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) return; gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn); if (is_error_pfn(pfn)) return; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6e6d115fe9b5..0f67d7e24800 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -256,7 +256,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (ret) return ret; - mark_page_dirty(vcpu->kvm, table_gfn); + kvm_vcpu_mark_page_dirty(vcpu, table_gfn); walker->ptes[level] = pte; } return 0; @@ -338,7 +338,7 @@ retry_walk: real_gfn = gpa_to_gfn(real_gfn); - host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn, + host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn, &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; @@ -511,11 +511,11 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, base_gpa = pte_gpa & ~mask; index = (pte_gpa - base_gpa) / sizeof(pt_element_t); - r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, + r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa, gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); curr_pte = gw->prefetch_ptes[index]; } else - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, + r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &curr_pte, sizeof(curr_pte)); return r || curr_pte != gw->ptes[level - 1]; @@ -869,8 +869,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (!rmap_can_add(vcpu)) break; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) break; FNAME(update_pte)(vcpu, sp, sptep, &gpte); @@ -956,8 +956,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) return -EINVAL; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { @@ -970,7 +970,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_access &= FNAME(gpte_access)(vcpu, gpte); FNAME(protect_clean_gpte)(&pte_access, gpte); - if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, &nr_present)) continue; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c48748cf638e..6ff1faf4a2e8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1955,8 +1955,8 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; - ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte, - offset_in_page(cr3) + index * 8, 8); + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, + offset_in_page(cr3) + index * 8, 8); if (ret) return 0; return pdpte; @@ -2114,7 +2114,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) might_sleep(); - page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT); if (is_error_page(page)) goto error; @@ -2153,7 +2153,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) mask = (0xf >> (4 - size)) << start_bit; val = 0; - if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len)) + if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len)) return NESTED_EXIT_DONE; return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; @@ -2178,7 +2178,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) /* Offset is in 32 bit units but need in 8 bit units */ offset *= 4; - if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4)) return NESTED_EXIT_DONE; return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; @@ -2449,7 +2449,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) p = msrpm_offsets[i]; offset = svm->nested.vmcb_msrpm + (p * 4); - if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) return false; svm->nested.msrpm[p] = svm->msrpm[p] | value; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b5b7b8ab2f59..8c80b7d7343c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -786,7 +786,7 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) { - struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); + struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT); if (is_error_page(page)) return NULL; @@ -7323,7 +7323,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, bitmap += (port & 0x7fff) / 8; if (last_bitmap != bitmap) - if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) + if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) return true; if (b & (1 << (port & 7))) return true; @@ -7367,7 +7367,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, /* Then read the msr_index'th bit from this bitmap: */ if (msr_index < 1024*8) { unsigned char b; - if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) + if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) return true; return 1 & (b >> (msr_index & 7)); } else @@ -7632,9 +7632,9 @@ static void vmx_disable_pml(struct vcpu_vmx *vmx) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } -static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) +static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vmx->vcpu.kvm; + struct vcpu_vmx *vmx = to_vmx(vcpu); u64 *pml_buf; u16 pml_idx; @@ -7656,7 +7656,7 @@ static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) gpa = pml_buf[pml_idx]; WARN_ON(gpa & (PAGE_SIZE - 1)); - mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); } /* reset PML index */ @@ -7851,7 +7851,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) * flushed already. */ if (enable_pml) - vmx_flush_pml_buffer(vmx); + vmx_flush_pml_buffer(vcpu); /* If guest state is invalid, start emulating */ if (vmx->emulation_required) @@ -9109,8 +9109,8 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr.host_initiated = false; for (i = 0; i < count; i++) { - if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), - &e, sizeof(e))) { + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), + &e, sizeof(e))) { pr_warn_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); @@ -9143,9 +9143,9 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) for (i = 0; i < count; i++) { struct msr_data msr_info; - if (kvm_read_guest(vcpu->kvm, - gpa + i * sizeof(e), - &e, 2 * sizeof(u32))) { + if (kvm_vcpu_read_guest(vcpu, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { pr_warn_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); @@ -9165,10 +9165,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index); return -EINVAL; } - if (kvm_write_guest(vcpu->kvm, - gpa + i * sizeof(e) + - offsetof(struct vmx_msr_entry, value), - &msr_info.data, sizeof(msr_info.data))) { + if (kvm_vcpu_write_guest(vcpu, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &msr_info.data, sizeof(msr_info.data))) { pr_warn_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, msr_info.data); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51d994e1d6af..a510f135180a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -478,7 +478,7 @@ EXPORT_SYMBOL_GPL(kvm_require_dr); /* * This function will be used to read from the physical memory of the currently - * running guest. The difference to kvm_read_guest_page is that this function + * running guest. The difference to kvm_vcpu_read_guest_page is that this function * can read from guest physical or from the guest's guest physical memory. */ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, @@ -496,7 +496,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, real_gfn = gpa_to_gfn(real_gfn); - return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); + return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); @@ -2030,7 +2030,7 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) r = PTR_ERR(page); goto out; } - if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) + if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) goto out_free; r = 0; out_free: @@ -2130,13 +2130,13 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; } gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(vcpu->kvm, gfn); + addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return 1; if (__clear_user((void __user *)addr, PAGE_SIZE)) return 1; vcpu->arch.hv_vapic = data; - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) return 1; break; @@ -4425,8 +4425,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data, - offset, toread); + ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, + offset, toread); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4459,8 +4459,8 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, offset = addr & (PAGE_SIZE-1); if (WARN_ON(offset + bytes > PAGE_SIZE)) bytes = (unsigned)PAGE_SIZE - offset; - ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val, - offset, bytes); + ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val, + offset, bytes); if (unlikely(ret < 0)) return X86EMUL_IO_NEEDED; @@ -4506,7 +4506,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); + ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4559,7 +4559,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { int ret; - ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes); if (ret < 0) return 0; kvm_mmu_pte_write(vcpu, gpa, val, bytes); @@ -4593,7 +4593,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - return !kvm_read_guest(vcpu->kvm, gpa, val, bytes); + return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes); } static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4791,7 +4791,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) goto emul_write; - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); if (is_error_page(page)) goto emul_write; @@ -4819,7 +4819,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; - mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -6570,7 +6570,7 @@ static void process_smi(struct kvm_vcpu *vcpu) else process_smi_save_state_32(vcpu, buf); - kvm_write_guest(vcpu->kvm, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); + kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); if (kvm_x86_ops->get_nmi_mask(vcpu)) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; @@ -8075,7 +8075,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) * memslots->generation has been incremented. * mmio generation may have reached its maximum value. */ - kvm_mmu_invalidate_mmio_sptes(kvm); + kvm_mmu_invalidate_mmio_sptes(kvm, slots); } int kvm_arch_prepare_memory_region(struct kvm *kvm, -- cgit v1.2.1 From 9da0e4d5ac969909f6b435ce28ea28135a9cbd69 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 May 2015 13:33:16 +0200 Subject: KVM: x86: work on all available address spaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch has no semantic change, but it prepares for the introduction of a second address space for system management mode. A new function x86_set_memory_region (and the "slots_lock taken" counterpart __x86_set_memory_region) is introduced in order to operate on all address spaces when adding or deleting private memory slots. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++ arch/x86/kvm/mmu.c | 84 ++++++++++++++++++++++------------------- arch/x86/kvm/vmx.c | 6 +-- arch/x86/kvm/x86.c | 40 ++++++++++++++++++-- 4 files changed, 91 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2fd420255c2f..5a5e13af6e03 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1189,4 +1189,9 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); void kvm_deliver_pmi(struct kvm_vcpu *vcpu); +int __x86_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem); +int x86_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3814f483ac45..7619e9e1745c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1503,30 +1503,33 @@ static int kvm_handle_hva_range(struct kvm *kvm, struct kvm_memory_slot *memslot; struct slot_rmap_walk_iterator iterator; int ret = 0; + int i; - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1, - &iterator) - ret |= handler(kvm, iterator.rmap, memslot, - iterator.gfn, iterator.level, data); + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, + gfn_start, gfn_end - 1, + &iterator) + ret |= handler(kvm, iterator.rmap, memslot, + iterator.gfn, iterator.level, data); + } } return ret; @@ -4536,21 +4539,23 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - - slots = kvm_memslots(kvm); + int i; spin_lock(&kvm->mmu_lock); - kvm_for_each_memslot(memslot, slots) { - gfn_t start, end; - - start = max(gfn_start, memslot->base_gfn); - end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) - continue; + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; - slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, - start, end - 1, true); + slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); + } } spin_unlock(&kvm->mmu_lock); @@ -4907,15 +4912,18 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) unsigned int nr_pages = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + int i; - slots = kvm_memslots(kvm); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) - nr_pages += memslot->npages; + kvm_for_each_memslot(memslot, slots) + nr_pages += memslot->npages; + } nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); return nr_mmu_pages; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8c80b7d7343c..862fa8f2c61d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4115,7 +4115,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; @@ -4150,7 +4150,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, &kvm_userspace_mem); return r; } @@ -4956,7 +4956,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) .flags = 0, }; - ret = kvm_set_memory_region(kvm, &tss_mem); + ret = x86_set_memory_region(kvm, &tss_mem); if (ret) return ret; kvm->arch.tss_addr = addr; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a510f135180a..caa0d5f8e6b3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7963,6 +7963,40 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } +int __x86_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem) +{ + int i, r; + + /* Called with kvm->slots_lock held. */ + BUG_ON(mem->slot >= KVM_MEM_SLOTS_NUM); + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + struct kvm_userspace_memory_region m = *mem; + + m.slot |= i << 16; + r = __kvm_set_memory_region(kvm, &m); + if (r < 0) + return r; + } + + return 0; +} +EXPORT_SYMBOL_GPL(__x86_set_memory_region); + +int x86_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem) +{ + int r; + + mutex_lock(&kvm->slots_lock); + r = __x86_set_memory_region(kvm, mem); + mutex_unlock(&kvm->slots_lock); + + return r; +} +EXPORT_SYMBOL_GPL(x86_set_memory_region); + void kvm_arch_destroy_vm(struct kvm *kvm) { if (current->mm == kvm->mm) { @@ -7974,13 +8008,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm) struct kvm_userspace_memory_region mem; memset(&mem, 0, sizeof(mem)); mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, &mem); mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, &mem); mem.slot = TSS_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, &mem); } kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); -- cgit v1.2.1 From 699023e239658e62da6f42f47d31b54788521ec1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 May 2015 15:03:39 +0200 Subject: KVM: x86: add SMM to the MMU role, support SMRAM address space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is now very simple to do. The only interesting part is a simple trick to find the right memslot in gfn_to_rmap, retrieving the address space from the spte role word. The same trick is used in the auditing code. The comment on top of union kvm_mmu_page_role has been stale forever, so remove it. Speaking of stale code, remove pad_for_nice_hex_output too: it was splitting the "access" bitfield across two bytes and thus had effectively turned into pad_for_ugly_hex_output. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 26 +++++++++++++++----------- arch/x86/kvm/mmu.c | 15 ++++++++++++--- arch/x86/kvm/mmu_audit.c | 10 +++++++--- arch/x86/kvm/x86.c | 2 ++ 4 files changed, 36 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5a5e13af6e03..47006683f2fe 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -184,23 +184,12 @@ struct kvm_mmu_memory_cache { void *objects[KVM_NR_MEM_OBJS]; }; -/* - * kvm_mmu_page_role, below, is defined as: - * - * bits 0:3 - total guest paging levels (2-4, or zero for real mode) - * bits 4:7 - page table level for this shadow (1-4) - * bits 8:9 - page table quadrant for 2-level guests - * bit 16 - direct mapping of virtual to physical mapping at gfn - * used for real mode and two-dimensional paging - * bits 17:19 - common access permissions for all ptes in this shadow page - */ union kvm_mmu_page_role { unsigned word; struct { unsigned level:4; unsigned cr4_pae:1; unsigned quadrant:2; - unsigned pad_for_nice_hex_output:6; unsigned direct:1; unsigned access:3; unsigned invalid:1; @@ -208,6 +197,15 @@ union kvm_mmu_page_role { unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; + unsigned :8; + + /* + * This is left at the top of the word so that + * kvm_memslots_for_spte_role can extract it with a + * simple shift. While there is room, give it a whole + * byte so it is also faster to load it from memory. + */ + unsigned smm:8; }; }; @@ -1120,6 +1118,12 @@ enum { #define HF_SMM_MASK (1 << 6) #define HF_SMM_INSIDE_NMI_MASK (1 << 7) +#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +#define KVM_ADDRESS_SPACE_NUM 2 + +#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) +#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) + /* * Hardware virtualization extension instructions may fault if a * reboot turns off virtualization while processes are running. diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7619e9e1745c..c88f0e443669 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -806,13 +806,15 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; gfn_t gfn; int i; gfn = sp->gfn; - slot = gfn_to_memslot(kvm, gfn); + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count += 1; @@ -822,13 +824,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; gfn_t gfn; int i; gfn = sp->gfn; - slot = gfn_to_memslot(kvm, gfn); + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count -= 1; @@ -1045,9 +1049,11 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, */ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; - slot = gfn_to_memslot(kvm, gfn); + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); return __gfn_to_rmap(gfn, sp->role.level, slot); } @@ -3924,6 +3930,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; + context->base_role.smm = is_smm(vcpu); context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -3985,6 +3992,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) = smep && !is_write_protection(vcpu); context->base_role.smap_andnot_wp = smap && !is_write_protection(vcpu); + context->base_role.smm = is_smm(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -4268,6 +4276,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.nxe = 1; mask.smep_andnot_wp = 1; mask.smap_andnot_wp = 1; + mask.smm = 1; /* * If we don't have indirect shadow pages, it means no page is diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 78288c15400c..a4f62e6f2db2 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -131,12 +131,16 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); unsigned long *rmapp; struct kvm_mmu_page *rev_sp; + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; gfn_t gfn; rev_sp = page_header(__pa(sptep)); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - if (!gfn_to_memslot(kvm, gfn)) { + slots = kvm_memslots_for_spte_role(kvm, rev_sp->role); + slot = __gfn_to_memslot(slots, gfn); + if (!slot) { if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no memslot for gfn %llx\n", gfn); @@ -146,7 +150,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = gfn_to_rmap(kvm, gfn, rev_sp); + rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot); if (!*rmapp) { if (!__ratelimit(&ratelimit_state)) return; @@ -197,7 +201,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->role.direct || sp->unsync || sp->role.invalid) return; - slots = kvm_memslots(kvm); + slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index caa0d5f8e6b3..7489871b63df 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5490,6 +5490,8 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } } + + kvm_mmu_reset_context(vcpu); } static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) -- cgit v1.2.1 From 6d396b55203969ca61cc8f838db2e68433e13f7b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 1 Apr 2015 14:25:33 +0200 Subject: KVM: x86: advertise KVM_CAP_X86_SMM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... and we're done. :) Because SMBASE is usually relocated above 1M on modern chipsets, and SMM handlers might indeed rely on 4G segment limits, we only expose it if KVM is able to run the guest in big real mode. This includes any of VMX+emulate_invalid_guest_state, VMX+unrestricted_guest, or SVM. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 15 +++++++++++++++ 4 files changed, 28 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 47006683f2fe..8ca32cfbcbd8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -709,6 +709,7 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); + bool (*cpu_has_high_real_mode_segbase)(void); void (*cpuid_update)(struct kvm_vcpu *vcpu); /* Create, but do not attach this VCPU */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6ff1faf4a2e8..680753186489 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4080,6 +4080,11 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } +static bool svm_has_high_real_mode_segbase(void) +{ + return true; +} + static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; @@ -4353,6 +4358,7 @@ static struct kvm_x86_ops svm_x86_ops = { .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, + .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase, .vcpu_create = svm_create_vcpu, .vcpu_free = svm_free_vcpu, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 862fa8f2c61d..06a186b07631 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8139,6 +8139,11 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_has_high_real_mode_segbase(void) +{ + return enable_unrestricted_guest || emulate_invalid_guest_state; +} + static bool vmx_mpx_supported(void) { return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && @@ -10296,6 +10301,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .cpu_has_accelerated_tpr = report_flexpriority, + .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7489871b63df..43f0df7ddc9c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2900,6 +2900,17 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #endif r = 1; break; + case KVM_CAP_X86_SMM: + /* SMBASE is usually relocated above 1M on modern chipsets, + * and SMM handlers might indeed rely on 4G segment limits, + * so do not report SMM to be available if real mode is + * emulated via vm86 mode. Still, do not go to great lengths + * to avoid userspace's usage of the feature, because it is a + * fringe case that is not enabled except via specific settings + * of the module parameters. + */ + r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); + break; case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; @@ -4299,6 +4310,10 @@ static void kvm_init_msr_list(void) for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { switch (emulated_msrs[i]) { + case MSR_IA32_SMBASE: + if (!kvm_x86_ops->cpu_has_high_real_mode_segbase()) + continue; + break; default: break; } -- cgit v1.2.1 From e80a4a9426adeaa34c009bc0bc61365e0580bf01 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jun 2015 16:32:48 +0200 Subject: KVM: x86: mark legacy PCI device assignment as deprecated Follow up to commit e194bbdf362ba7d53cfd23ba24f1a7c90ef69a74. Suggested-by: Bandan Das Suggested-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a0f06a5947c5..d8a1d56276e1 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -86,7 +86,7 @@ config KVM_MMU_AUDIT auditing of KVM MMU events at runtime. config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support" + bool "KVM legacy PCI device assignment support (DEPRECATED)" depends on KVM && PCI && IOMMU_API default n ---help--- -- cgit v1.2.1 From 138bd56a2164c65f68042ba580d88ab70c036fd1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2015 14:11:49 +0200 Subject: x86/asm/entry/64/compat: Rename ia32entry.S -> entry_64_compat.S So we now have the following system entry code related files, which define the following system call instruction and other entry paths: entry_32.S # 32-bit binaries on 32-bit kernels entry_64.S # 64-bit binaries on 64-bit kernels entry_64_compat.S # 32-bit binaries on 64-bit kernels Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Josh Triplett Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 2 +- arch/x86/entry/entry_64.S | 2 +- arch/x86/entry/entry_64_compat.S | 547 +++++++++++++++++++++++++++++++++++++++ arch/x86/entry/ia32entry.S | 547 --------------------------------------- 4 files changed, 549 insertions(+), 549 deletions(-) create mode 100644 arch/x86/entry/entry_64_compat.S delete mode 100644 arch/x86/entry/ia32entry.S (limited to 'arch/x86') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index b93cce1c6bb2..7a144971db79 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -6,5 +6,5 @@ obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += vdso/ obj-y += vsyscall/ -obj-$(CONFIG_IA32_EMULATION) += ia32entry.o syscall_32.o +obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 31770662b940..4cf3dd36aa0d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -516,7 +516,7 @@ ENTRY(ret_from_fork) /* * By the time we get here, we have no idea whether our pt_regs, * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the ia32entry paths. + * the slow path, or one of the 32-bit compat paths. * Use IRET code path to return, since it can safely handle * all of the above. */ diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S new file mode 100644 index 000000000000..9558dacf32b9 --- /dev/null +++ b/arch/x86/entry/entry_64_compat.S @@ -0,0 +1,547 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ +#include "calling.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +# define sysexit_audit ia32_ret_from_sys_call +# define sysretl_audit ia32_ret_from_sys_call +#endif + + .section .entry.text, "ax" + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) +#endif + +/* + * 32-bit SYSENTER instruction entry. + * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_sysenter_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %ebp, %ebp + movl %eax, %eax + + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp */ + pushfq /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + cld + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + + /* + * no need to do an access_ok check here because rbp has been + * 32-bit zero extended + */ + ASM_STAC +1: movl (%rbp), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. + */ + testl $X86_EFLAGS_NT, EFLAGS(%rsp) + jnz sysenter_fix_flags +sysenter_flags_fixed: + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysenter_tracesys + +sysenter_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ +sysenter_dispatch: + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysexit_audit +sysexit_from_sys_call: + /* + * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an + * NMI between STI and SYSEXIT has poorly specified behavior, + * and and NMI followed by an IRQ with usergs is fatal. So + * we just pretend we're using SYSEXIT but we really use + * SYSRETL instead. + * + * This code path is still called 'sysexit' because it pairs + * with 'sysenter' and it uses the SYSENTER calling convention. + */ + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp), %ecx /* User %eip */ + RESTORE_RSI_RDI + xorl %edx, %edx /* Do not leak kernel information */ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + movl EFLAGS(%rsp), %r11d /* User eflags */ + TRACE_IRQS_ON + + /* + * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, + * since it avoids a dicey window with interrupts enabled. + */ + movl RSP(%rsp), %esp + + /* + * USERGS_SYSRET32 does: + * gsbase = user's gs base + * eip = ecx + * rflags = r11 + * cs = __USER32_CS + * ss = __USER_DS + * + * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: + * + * pop %ebp + * pop %edx + * pop %ecx + * + * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to + * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's + * address (already known to user code), and R12-R15 are + * callee-saved and therefore don't contain any interesting + * kernel data. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi, %r8d /* 5th arg: 4th syscall arg */ + movl %ecx, %r9d /* swap with edx */ + movl %edx, %ecx /* 4th arg: 3rd syscall arg */ + movl %r9d, %edx /* 3rd arg: 2nd syscall arg */ + movl %ebx, %esi /* 2nd arg: 1st syscall arg */ + movl %eax, %edi /* 1st arg: syscall number */ + call __audit_syscall_entry + movl ORIG_RAX(%rsp), %eax /* reload syscall number */ + movl %ebx, %edi /* reload 1st syscall arg */ + movl RCX(%rsp), %esi /* reload 2nd syscall arg */ + movl RDX(%rsp), %edx /* reload 3rd syscall arg */ + movl RSI(%rsp), %ecx /* reload 4th syscall arg */ + movl RDI(%rsp), %r8d /* reload 5th syscall arg */ + .endm + + .macro auditsys_exit exit + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movl %eax, %esi /* second arg, syscall return value */ + cmpl $-MAX_ERRNO, %eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ + movzbl %al, %edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movq RAX(%rsp), %rax /* reload syscall return value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz \exit + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + jmp int_with_check + .endm + +sysenter_auditsys: + auditsys_entry_common + movl %ebp, %r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_fix_flags: + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq + jmp sysenter_flags_fixed + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz sysenter_auditsys +#endif + SAVE_EXTRA_REGS + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + + /* Reload arg registers from stack. (see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + + RESTORE_EXTRA_REGS + jmp sysenter_do_call +ENDPROC(ia32_sysenter_target) + +/* + * 32-bit SYSCALL instruction entry. + * + * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * + * Arguments: + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_cstar_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movl %esp, %r8d + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax, %eax + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rbp /* pt_regs->cx */ + movl %ebp, %ecx + pushq $-ENOSYS /* pt_regs->ax */ + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + + /* + * No need to do an access_ok check here because r8 has been + * 32-bit zero extended: + */ + ASM_STAC +1: movl (%r8), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz cstar_tracesys + +cstar_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + +cstar_dispatch: + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysretl_audit + +sysretl_from_sys_call: + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RCX(%rsp), %ebp + RESTORE_RSI_RDI_RDX + movl RIP(%rsp), %ecx + movl EFLAGS(%rsp), %r11d + xorq %r10, %r10 + xorq %r9, %r9 + xorq %r8, %r8 + TRACE_IRQS_ON + movl RSP(%rsp), %esp + /* + * 64-bit->32-bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * (Note: 32-bit->32-bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + auditsys_entry_common + movl %ebp, %r9d /* reload 6th syscall arg */ + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz cstar_auditsys +#endif + SAVE_EXTRA_REGS + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + + /* Reload arg registers from stack. (see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + + RESTORE_EXTRA_REGS + jmp cstar_do_call +END(ia32_cstar_target) + +ia32_badarg: + ASM_CLAC + movq $-EFAULT, %rax + jmp ia32_sysret + +ia32_ret_from_sys_call: + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + jmp int_ret_from_sys_call + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts off. + */ + +ENTRY(ia32_syscall) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + PARAVIRT_ADJUST_EXCEPTION_FRAME + SWAPGS + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax, %eax + + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq $0 /* pt_regs->r8 */ + pushq $0 /* pt_regs->r9 */ + pushq $0 /* pt_regs->r10 */ + pushq $0 /* pt_regs->r11 */ + cld + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_tracesys + +ia32_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + + call *ia32_sys_call_table(, %rax, 8) /* RIP relative */ + +ia32_sysret: + movq %rax, RAX(%rsp) +1: + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_EXTRA_REGS + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + /* + * Reload arg registers from stack in case ptrace changed them. + * Don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. But do truncate it to 32 bits. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + RESTORE_EXTRA_REGS + jmp ia32_do_call +END(ia32_syscall) + + .macro PTREGSCALL label, func + ALIGN +GLOBAL(\label) + leaq \func(%rip), %rax + jmp ia32_ptregs_common + .endm + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork + + ALIGN +GLOBAL(stub32_clone) + leaq sys_clone(%rip), %rax + /* + * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). + * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). + * + * The native 64-bit kernel's sys_clone() implements the latter, + * so we need to swap arguments here before calling it: + */ + xchg %r8, %rcx + jmp ia32_ptregs_common + + ALIGN +ia32_ptregs_common: + SAVE_EXTRA_REGS 8 + call *%rax + RESTORE_EXTRA_REGS 8 + ret +END(ia32_ptregs_common) diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S deleted file mode 100644 index 9558dacf32b9..000000000000 --- a/arch/x86/entry/ia32entry.S +++ /dev/null @@ -1,547 +0,0 @@ -/* - * Compatibility mode system call entry point for x86-64. - * - * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ -#include "calling.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -# define sysexit_audit ia32_ret_from_sys_call -# define sysretl_audit ia32_ret_from_sys_call -#endif - - .section .entry.text, "ax" - -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret32) - swapgs - sysretl -ENDPROC(native_usergs_sysret32) -#endif - -/* - * 32-bit SYSENTER instruction entry. - * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). - * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp user stack - * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_sysenter_target) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %ebp, %ebp - movl %eax, %eax - - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - - /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp */ - pushfq /* pt_regs->flags */ - pushq $__USER32_CS /* pt_regs->cs */ - pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - cld - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ - - /* - * no need to do an access_ok check here because rbp has been - * 32-bit zero extended - */ - ASM_STAC -1: movl (%rbp), %ebp - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - - /* - * Sysenter doesn't filter flags, so we need to clear NT - * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. - */ - testl $X86_EFLAGS_NT, EFLAGS(%rsp) - jnz sysenter_fix_flags -sysenter_flags_fixed: - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysenter_tracesys - -sysenter_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ -sysenter_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysexit_audit -sysexit_from_sys_call: - /* - * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an - * NMI between STI and SYSEXIT has poorly specified behavior, - * and and NMI followed by an IRQ with usergs is fatal. So - * we just pretend we're using SYSEXIT but we really use - * SYSRETL instead. - * - * This code path is still called 'sysexit' because it pairs - * with 'sysenter' and it uses the SYSENTER calling convention. - */ - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp), %ecx /* User %eip */ - RESTORE_RSI_RDI - xorl %edx, %edx /* Do not leak kernel information */ - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r10 - movl EFLAGS(%rsp), %r11d /* User eflags */ - TRACE_IRQS_ON - - /* - * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, - * since it avoids a dicey window with interrupts enabled. - */ - movl RSP(%rsp), %esp - - /* - * USERGS_SYSRET32 does: - * gsbase = user's gs base - * eip = ecx - * rflags = r11 - * cs = __USER32_CS - * ss = __USER_DS - * - * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: - * - * pop %ebp - * pop %edx - * pop %ecx - * - * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to - * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's - * address (already known to user code), and R12-R15 are - * callee-saved and therefore don't contain any interesting - * kernel data. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL - .macro auditsys_entry_common - movl %esi, %r8d /* 5th arg: 4th syscall arg */ - movl %ecx, %r9d /* swap with edx */ - movl %edx, %ecx /* 4th arg: 3rd syscall arg */ - movl %r9d, %edx /* 3rd arg: 2nd syscall arg */ - movl %ebx, %esi /* 2nd arg: 1st syscall arg */ - movl %eax, %edi /* 1st arg: syscall number */ - call __audit_syscall_entry - movl ORIG_RAX(%rsp), %eax /* reload syscall number */ - movl %ebx, %edi /* reload 1st syscall arg */ - movl RCX(%rsp), %esi /* reload 2nd syscall arg */ - movl RDX(%rsp), %edx /* reload 3rd syscall arg */ - movl RSI(%rsp), %ecx /* reload 4th syscall arg */ - movl RDI(%rsp), %r8d /* reload 5th syscall arg */ - .endm - - .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax, %esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO, %eax /* is it an error ? */ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al, %edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX(%rsp), %rax /* reload syscall return value */ - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_with_check - .endm - -sysenter_auditsys: - auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch - -sysexit_audit: - auditsys_exit sysexit_from_sys_call -#endif - -sysenter_fix_flags: - pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) - popfq - jmp sysenter_flags_fixed - -sysenter_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz sysenter_auditsys -#endif - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - - /* Reload arg registers from stack. (see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - jmp sysenter_do_call -ENDPROC(ia32_sysenter_target) - -/* - * 32-bit SYSCALL instruction entry. - * - * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. - * - * Note: rflags saving+masking-with-MSR happens only in Long mode - * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit - * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes - * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). - * - * Arguments: - * eax system call number - * ecx return address - * ebx arg1 - * ebp arg2 (note: not saved in the stack frame, should not be touched) - * edx arg3 - * esi arg4 - * edi arg5 - * esp user stack - * 0(%esp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_cstar_target) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movl %esp, %r8d - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - - /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ - pushq %r8 /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER32_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rbp /* pt_regs->cx */ - movl %ebp, %ecx - pushq $-ENOSYS /* pt_regs->ax */ - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ - - /* - * No need to do an access_ok check here because r8 has been - * 32-bit zero extended: - */ - ASM_STAC -1: movl (%r8), %ebp - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz cstar_tracesys - -cstar_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - -cstar_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit - -sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RCX(%rsp), %ebp - RESTORE_RSI_RDI_RDX - movl RIP(%rsp), %ecx - movl EFLAGS(%rsp), %r11d - xorq %r10, %r10 - xorq %r9, %r9 - xorq %r8, %r8 - TRACE_IRQS_ON - movl RSP(%rsp), %esp - /* - * 64-bit->32-bit SYSRET restores eip from ecx, - * eflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * (Note: 32-bit->32-bit SYSRET is different: since r11 - * does not exist, it merely sets eflags.IF=1). - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we must - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL -cstar_auditsys: - auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call -#endif - -cstar_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys -#endif - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - - /* Reload arg registers from stack. (see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - jmp cstar_do_call -END(ia32_cstar_target) - -ia32_badarg: - ASM_CLAC - movq $-EFAULT, %rax - jmp ia32_sysret - -ia32_ret_from_sys_call: - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_ret_from_sys_call - -/* - * Emulated IA32 system calls via int 0x80. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ - -ENTRY(ia32_syscall) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - SWAPGS - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - - /* Construct struct pt_regs on stack (iret frame is already on stack) */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 */ - pushq $0 /* pt_regs->r9 */ - pushq $0 /* pt_regs->r10 */ - pushq $0 /* pt_regs->r11 */ - cld - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys - -ia32_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - - call *ia32_sys_call_table(, %rax, 8) /* RIP relative */ - -ia32_sysret: - movq %rax, RAX(%rsp) -1: - jmp int_ret_from_sys_call - -ia32_tracesys: - SAVE_EXTRA_REGS - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - /* - * Reload arg registers from stack in case ptrace changed them. - * Don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. But do truncate it to 32 bits. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. - */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - RESTORE_EXTRA_REGS - jmp ia32_do_call -END(ia32_syscall) - - .macro PTREGSCALL label, func - ALIGN -GLOBAL(\label) - leaq \func(%rip), %rax - jmp ia32_ptregs_common - .endm - - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork - - ALIGN -GLOBAL(stub32_clone) - leaq sys_clone(%rip), %rax - /* - * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). - * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). - * - * The native 64-bit kernel's sys_clone() implements the latter, - * so we need to swap arguments here before calling it: - */ - xchg %r8, %rcx - jmp ia32_ptregs_common - - ALIGN -ia32_ptregs_common: - SAVE_EXTRA_REGS 8 - call *%rax - RESTORE_EXTRA_REGS 8 - ret -END(ia32_ptregs_common) -- cgit v1.2.1 From 9dac6290945142e6b87d9f027edfee676dcfbfda Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Jun 2015 18:55:09 +0200 Subject: x86/mm/pat: Untangle pat_init() Split it into a BSP and AP version which makes the PAT initialization path actually readable again. Signed-off-by: Borislav Petkov Reviewed-by: Toshi Kani Cc: Andrew Morton Cc: Andy Lutomirski Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: hch@lst.de Cc: hmh@hmh.eng.br Cc: jgross@suse.com Cc: konrad.wilk@oracle.com Cc: linux-mm Cc: linux-nvdimm@lists.01.org Cc: stefan.bader@canonical.com Cc: yigal@plexistor.com Link: http://lkml.kernel.org/r/1433436928-31903-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 69 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index a1c96544099d..476d0780560f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -36,6 +36,8 @@ #undef pr_fmt #define pr_fmt(fmt) "" fmt +static bool boot_cpu_done; + static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); static inline void pat_disable(const char *reason) @@ -194,31 +196,47 @@ void pat_init_cache_modes(void) #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) -void pat_init(void) +static void pat_bsp_init(u64 pat) { - u64 pat; - bool boot_cpu = !boot_pat_state; + if (!cpu_has_pat) { + pat_disable("PAT not supported by CPU."); + return; + } - if (!pat_enabled()) + rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); + if (!boot_pat_state) { + pat_disable("PAT MSR is 0, disabled."); return; + } + wrmsrl(MSR_IA32_CR_PAT, pat); + + pat_init_cache_modes(); +} + +static void pat_ap_init(u64 pat) +{ if (!cpu_has_pat) { - if (!boot_pat_state) { - pat_disable("PAT not supported by CPU."); - return; - } else { - /* - * If this happens we are on a secondary CPU, but - * switched to PAT on the boot CPU. We have no way to - * undo PAT. - */ - pr_err("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); - BUG(); - } + /* + * If this happens we are on a secondary CPU, but switched to + * PAT on the boot CPU. We have no way to undo PAT. + */ + panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); } - /* Set PWT to Write-Combining. All other bits stay the same */ + wrmsrl(MSR_IA32_CR_PAT, pat); +} + +void pat_init(void) +{ + u64 pat; + + if (!pat_enabled()) + return; + /* + * Set PWT to Write-Combining. All other bits stay the same: + * * PTE encoding used in Linux: * PAT * |PCD @@ -233,19 +251,12 @@ void pat_init(void) pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); - /* Boot CPU check */ - if (!boot_pat_state) { - rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); - if (!boot_pat_state) { - pat_disable("PAT read returns always zero, disabled."); - return; - } + if (!boot_cpu_done) { + pat_bsp_init(pat); + boot_cpu_done = true; + } else { + pat_ap_init(pat); } - - wrmsrl(MSR_IA32_CR_PAT, pat); - - if (boot_cpu) - pat_init_cache_modes(); } #undef PAT -- cgit v1.2.1 From 9cd25aac1f44f269de5ecea11f7d927f37f1d01c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Jun 2015 18:55:10 +0200 Subject: x86/mm/pat: Emulate PAT when it is disabled In the case when PAT is disabled on the command line with "nopat" or when virtualization doesn't support PAT (correctly) - see 9d34cfdf4796 ("x86: Don't rely on VMWare emulating PAT MSR correctly"). we emulate it using the PWT and PCD cache attribute bits. Get rid of boot_pat_state while at it. Based on a conglomerate patch from Toshi Kani. Signed-off-by: Borislav Petkov Reviewed-by: Toshi Kani Acked-by: Juergen Gross Cc: Andrew Morton Cc: Andy Lutomirski Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: hch@lst.de Cc: hmh@hmh.eng.br Cc: konrad.wilk@oracle.com Cc: linux-mm Cc: linux-nvdimm@lists.01.org Cc: stefan.bader@canonical.com Cc: yigal@plexistor.com Link: http://lkml.kernel.org/r/1433436928-31903-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pat.h | 2 +- arch/x86/mm/init.c | 6 ++-- arch/x86/mm/pat.c | 81 ++++++++++++++++++++++++++++++---------------- arch/x86/xen/enlighten.c | 5 +-- 4 files changed, 60 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index cdcff7f7f694..ca6c228d5e62 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -6,7 +6,7 @@ bool pat_enabled(void); extern void pat_init(void); -void pat_init_cache_modes(void); +void pat_init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1d553186c434..8533b46e6bee 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -40,7 +40,7 @@ */ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WB ] = 0 | 0 , - [_PAGE_CACHE_MODE_WC ] = _PAGE_PWT | 0 , + [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, @@ -50,11 +50,11 @@ EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 476d0780560f..6dc7826e4797 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -68,8 +68,6 @@ static int __init pat_debug_setup(char *str) } __setup("debugpat", pat_debug_setup); -static u64 __read_mostly boot_pat_state; - #ifdef CONFIG_X86_PAT /* * X86 PAT uses page flags WC and Uncached together to keep track of @@ -177,14 +175,12 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. */ -void pat_init_cache_modes(void) +void pat_init_cache_modes(u64 pat) { - int i; enum page_cache_mode cache; char pat_msg[33]; - u64 pat; + int i; - rdmsrl(MSR_IA32_CR_PAT, pat); pat_msg[32] = 0; for (i = 7; i >= 0; i--) { cache = pat_get_cache_mode((pat >> (i * 8)) & 7, @@ -198,24 +194,33 @@ void pat_init_cache_modes(void) static void pat_bsp_init(u64 pat) { + u64 tmp_pat; + if (!cpu_has_pat) { pat_disable("PAT not supported by CPU."); return; } - rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); - if (!boot_pat_state) { + if (!pat_enabled()) + goto done; + + rdmsrl(MSR_IA32_CR_PAT, tmp_pat); + if (!tmp_pat) { pat_disable("PAT MSR is 0, disabled."); return; } wrmsrl(MSR_IA32_CR_PAT, pat); - pat_init_cache_modes(); +done: + pat_init_cache_modes(pat); } static void pat_ap_init(u64 pat) { + if (!pat_enabled()) + return; + if (!cpu_has_pat) { /* * If this happens we are on a secondary CPU, but switched to @@ -231,25 +236,45 @@ void pat_init(void) { u64 pat; - if (!pat_enabled()) - return; - - /* - * Set PWT to Write-Combining. All other bits stay the same: - * - * PTE encoding used in Linux: - * PAT - * |PCD - * ||PWT - * ||| - * 000 WB _PAGE_CACHE_WB - * 001 WC _PAGE_CACHE_WC - * 010 UC- _PAGE_CACHE_UC_MINUS - * 011 UC _PAGE_CACHE_UC - * PAT bit unused - */ - pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); + if (!pat_enabled()) { + /* + * No PAT. Emulate the PAT table that corresponds to the two + * cache bits, PWT (Write Through) and PCD (Cache Disable). This + * setup is the same as the BIOS default setup when the system + * has PAT but the "nopat" boot option has been specified. This + * emulated PAT table is used when MSR_IA32_CR_PAT returns 0. + * + * PTE encoding used: + * + * PCD + * |PWT PAT + * || slot + * 00 0 WB : _PAGE_CACHE_MODE_WB + * 01 1 WT : _PAGE_CACHE_MODE_WT + * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 11 3 UC : _PAGE_CACHE_MODE_UC + * + * NOTE: When WC or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. + */ + pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); + } else { + /* + * PTE encoding used in Linux: + * PAT + * |PCD + * ||PWT + * ||| + * 000 WB _PAGE_CACHE_WB + * 001 WC _PAGE_CACHE_WC + * 010 UC- _PAGE_CACHE_UC_MINUS + * 011 UC _PAGE_CACHE_UC + * PAT bit unused + */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); + } if (!boot_cpu_done) { pat_bsp_init(pat); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 46957ead3060..53233a9beea9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1467,6 +1467,7 @@ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; unsigned long initrd_start = 0; + u64 pat; int rc; if (!xen_start_info) @@ -1574,8 +1575,8 @@ asmlinkage __visible void __init xen_start_kernel(void) * Modify the cache mode translation tables to match Xen's PAT * configuration. */ - - pat_init_cache_modes(); + rdmsrl(MSR_IA32_CR_PAT, pat); + pat_init_cache_modes(pat); /* keep using Xen gdt for now; no urgent need to change it */ -- cgit v1.2.1 From 7202fdb1b3299ec78dc1e7702260947ec20dd9e9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Jun 2015 18:55:11 +0200 Subject: x86/mm/pat: Remove pat_enabled() checks Now that we emulate a PAT table when PAT is disabled, there's no need for those checks anymore as the PAT abstraction will handle those cases too. Based on a conglomerate patch from Toshi Kani. Signed-off-by: Borislav Petkov Reviewed-by: Toshi Kani Cc: Andrew Morton Cc: Andy Lutomirski Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: hch@lst.de Cc: hmh@hmh.eng.br Cc: jgross@suse.com Cc: konrad.wilk@oracle.com Cc: linux-mm Cc: linux-nvdimm@lists.01.org Cc: stefan.bader@canonical.com Cc: yigal@plexistor.com Link: http://lkml.kernel.org/r/1433436928-31903-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/iomap_32.c | 12 ++++++------ arch/x86/mm/ioremap.c | 5 +---- arch/x86/mm/pageattr.c | 3 --- arch/x86/mm/pat.c | 13 +++---------- 4 files changed, 10 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 3a2ec8790ca7..a9dc7a37e6a2 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -77,13 +77,13 @@ void __iomem * iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { /* - * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. - * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the - * MTRR is UC or WC. UC_MINUS gets the real intention, of the - * user, which is "WC if the MTRR is WC, UC if you can't do that." + * For non-PAT systems, translate non-WB request to UC- just in + * case the caller set the PWT bit to prot directly without using + * pgprot_writecombine(). UC- translates to uncached if the MTRR + * is UC or WC. UC- gets the real intention, of the user, which is + * "WC if the MTRR is WC, UC if you can't do that." */ - if (!pat_enabled() && pgprot_val(prot) == - (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC))) + if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB) prot = __pgprot(__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index b0da3588b452..cc0f17c5ad9f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -292,11 +292,8 @@ EXPORT_SYMBOL_GPL(ioremap_uc); */ void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { - if (pat_enabled()) - return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, __builtin_return_address(0)); - else - return ioremap_nocache(phys_addr, size); } EXPORT_SYMBOL(ioremap_wc); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index fae3c5366ac0..31b4f3fd1207 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1572,9 +1572,6 @@ int set_memory_wc(unsigned long addr, int numpages) { int ret; - if (!pat_enabled()) - return set_memory_uc(addr, numpages); - ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_WC, NULL); if (ret) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 6dc7826e4797..f89e460c55a8 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -438,12 +438,8 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, if (!pat_enabled()) { /* This is identical to page table setting without PAT */ - if (new_type) { - if (req_type == _PAGE_CACHE_MODE_WC) - *new_type = _PAGE_CACHE_MODE_UC_MINUS; - else - *new_type = req_type; - } + if (new_type) + *new_type = req_type; return 0; } @@ -947,11 +943,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, pgprot_t pgprot_writecombine(pgprot_t prot) { - if (pat_enabled()) - return __pgprot(pgprot_val(prot) | + return __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); - else - return pgprot_noncached(prot); } EXPORT_SYMBOL_GPL(pgprot_writecombine); -- cgit v1.2.1 From d79a40caf8e15a2a15ecdefdc9d05865d4f37baa Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 4 Jun 2015 18:55:12 +0200 Subject: x86/mm/pat: Use 7th PAT MSR slot for Write-Through PAT type Assign Write-Through type to the PA7 slot in the PAT MSR when the processor is not affected by PAT errata. The PA7 slot is chosen to improve robustness in the presence of errata that might cause the high PAT bit to be ignored. This way a buggy PA7 slot access will hit the PA3 slot, which is UC, so at worst we lose performance without causing a correctness issue. The following Intel processors are affected by the PAT errata. Errata CPUID ---------------------------------------------------- Pentium 2, A52 family 0x6, model 0x5 Pentium 3, E27 family 0x6, model 0x7, 0x8 Pentium 3 Xenon, G26 family 0x6, model 0x7, 0x8, 0xa Pentium M, Y26 family 0x6, model 0x9 Pentium M 90nm, X9 family 0x6, model 0xd Pentium 4, N46 family 0xf, model 0x0 Instead of making sharp boundary checks, we remain conservative and exclude all Pentium 2, 3, M and 4 family processors. For those, _PAGE_CACHE_MODE_WT is redirected to UC- per the default setup in __cachemode2pte_tbl[]. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: hch@lst.de Cc: hmh@hmh.eng.br Cc: jgross@suse.com Cc: konrad.wilk@oracle.com Cc: linux-mm Cc: linux-nvdimm@lists.01.org Cc: stefan.bader@canonical.com Cc: yigal@plexistor.com Link: https://lkml.kernel.org/r/1433187393-22688-2-git-send-email-toshi.kani@hp.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f89e460c55a8..59ab1a0fe21b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -235,6 +235,7 @@ static void pat_ap_init(u64 pat) void pat_init(void) { u64 pat; + struct cpuinfo_x86 *c = &boot_cpu_data; if (!pat_enabled()) { /* @@ -244,7 +245,7 @@ void pat_init(void) * has PAT but the "nopat" boot option has been specified. This * emulated PAT table is used when MSR_IA32_CR_PAT returns 0. * - * PTE encoding used: + * PTE encoding: * * PCD * |PWT PAT @@ -259,21 +260,61 @@ void pat_init(void) */ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); - } else { + + } else if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { /* - * PTE encoding used in Linux: + * PAT support with the lower four entries. Intel Pentium 2, + * 3, M, and 4 are affected by PAT errata, which makes the + * upper four entries unusable. To be on the safe side, we don't + * use those. + * + * PTE encoding: * PAT * |PCD - * ||PWT - * ||| - * 000 WB _PAGE_CACHE_WB - * 001 WC _PAGE_CACHE_WC - * 010 UC- _PAGE_CACHE_UC_MINUS - * 011 UC _PAGE_CACHE_UC + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC * PAT bit unused + * + * NOTE: When WT or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. */ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); + } else { + /* + * Full PAT support. We put WT in slot 7 to improve + * robustness in the presence of errata that might cause + * the high PAT bit to be ignored. This way, a buggy slot 7 + * access will hit slot 3, and slot 3 is UC, so at worst + * we lose performance without causing a correctness issue. + * Pentium 4 erratum N46 is an example for such an erratum, + * although we try not to use PAT at all on affected CPUs. + * + * PTE encoding: + * PAT + * |PCD + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC + * 100 4 WB : Reserved + * 101 5 WC : Reserved + * 110 6 UC-: Reserved + * 111 7 WT : _PAGE_CACHE_MODE_WT + * + * The reserved slots are unused, but mapped to their + * corresponding types in the presence of PAT errata. + */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); } if (!boot_cpu_done) { -- cgit v1.2.1 From 0d69bdff451a10aa48f80509e8bf18fb24683c06 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 4 Jun 2015 18:55:13 +0200 Subject: x86/mm/pat: Change reserve_memtype() for Write-Through type When a target range is in RAM, reserve_ram_pages_type() verifies the requested type. Change it to fail WT and WP requests with -EINVAL since set_page_memtype() is limited to handle three types: WB, WC and UC-. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: hch@lst.de Cc: hmh@hmh.eng.br Cc: jgross@suse.com Cc: konrad.wilk@oracle.com Cc: linux-mm Cc: linux-nvdimm@lists.01.org Cc: stefan.bader@canonical.com Cc: yigal@plexistor.com Link: http://lkml.kernel.org/r/1433436928-31903-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 59ab1a0fe21b..6311193fd1a2 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -401,9 +401,12 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) /* * For RAM pages, we use page flags to mark the pages with appropriate type. - * Here we do two pass: - * - Find the memtype of all the pages in the range, look for any conflicts - * - In case of no conflicts, set the new memtype for pages in the range + * The page flags are limited to three types, WB, WC and UC-. WT and WP requests + * fail with -EINVAL, and UC gets redirected to UC-. + * + * Here we do two passes: + * - Find the memtype of all the pages in the range, look for any conflicts. + * - In case of no conflicts, set the new memtype for pages in the range. */ static int reserve_ram_pages_type(u64 start, u64 end, enum page_cache_mode req_type, @@ -412,6 +415,13 @@ static int reserve_ram_pages_type(u64 start, u64 end, struct page *page; u64 pfn; + if ((req_type == _PAGE_CACHE_MODE_WT) || + (req_type == _PAGE_CACHE_MODE_WP)) { + if (new_type) + *new_type = _PAGE_CACHE_MODE_UC_MINUS; + return -EINVAL; + } + if (req_type == _PAGE_CACHE_MODE_UC) { /* We do not support strong UC */ WARN_ON_ONCE(1); @@ -461,6 +471,7 @@ static int free_ram_pages_type(u64 start, u64 end) * - _PAGE_CACHE_MODE_WC * - _PAGE_CACHE_MODE_UC_MINUS * - _PAGE_CACHE_MODE_UC + * - _PAGE_CACHE_MODE_WT * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return -- cgit v1.2.1 From ecb2febaaa3945e1578359adc30ca818e78540fb Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 4 Jun 2015 18:55:14 +0200 Subject: x86/mm: Teach is_new_memtype_allowed() about Write-Through type __ioremap_caller() calls reserve_memtype() and the passed down @new_pcm contains the actual page cache type it reserved in the success case. is_new_memtype_allowed() verifies if converting to the new page cache type is allowed when @pcm (the requested type) is different from @new_pcm. When WT is requested, the caller expects that writes are ordered and uncached. Therefore, enhance is_new_memtype_allowed() to disallow the following cases: - If the request is WT, mapping type cannot be WB - If the request is WT, mapping type cannot be WC Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: arnd@arndb.de Cc: hch@lst.de Cc: hmh@hmh.eng.br Cc: jgross@suse.com Cc: konrad.wilk@oracle.com Cc: linux-mm Cc: linux-nvdimm@lists.01.org Cc: stefan.bader@canonical.com Cc: yigal@plexistor.com Link: http://lkml.kernel.org/r/1433436928-31903-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index fe57e7a98839..2562e303405b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -398,11 +398,17 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, * requested memtype: * - request is uncached, return cannot be write-back * - request is write-combine, return cannot be write-back + * - request is write-through, return cannot be write-back + * - request is write-through, return cannot be write-combine */ if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WC && - new_pcm == _PAGE_CACHE_MODE_WB)) { + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WT && + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WT && + new_pcm == _PAGE_CACHE_MODE_WC)) { return 0; } -- cgit v1.2.1 From d838270e2516db11084bed4e294017eb7b646a75 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 4 Jun 2015 18:55:15 +0200 Subject: x86/mm, asm-generic: Add ioremap_wt() for creating Write-Through mappings Add ioremap_wt() for creating Write-Through mappings on x86. It follows the same model as ioremap_wc() for multi-arch support. Define ARCH_HAS_IOREMAP_WT in the x86 version of io.h to indicate that ioremap_wt() is implemented on x86. Also update the PAT documentation file to cover ioremap_wt(). Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: hch@lst.de Cc: hmh@hmh.eng.br Cc: jgross@suse.com Cc: konrad.wilk@oracle.com Cc: linux-mm Cc: linux-nvdimm@lists.01.org Cc: stefan.bader@canonical.com Cc: yigal@plexistor.com Link: http://lkml.kernel.org/r/1433436928-31903-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 2 ++ arch/x86/mm/ioremap.c | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index a94463063b46..83ec9b1d77cc 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -35,6 +35,7 @@ */ #define ARCH_HAS_IOREMAP_WC +#define ARCH_HAS_IOREMAP_WT #include #include @@ -320,6 +321,7 @@ extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); extern bool is_early_ioremap_ptep(pte_t *ptep); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index cc0f17c5ad9f..07cd46a8f30a 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -172,6 +172,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, prot = __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); break; + case _PAGE_CACHE_MODE_WT: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); + break; case _PAGE_CACHE_MODE_WB: break; } @@ -297,6 +301,23 @@ void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) } EXPORT_SYMBOL(ioremap_wc); +/** + * ioremap_wt - map memory into CPU space write through + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write through. + * Write through stores data into memory while keeping the cache up-to-date. + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, -- cgit v1.2.1 From d1b4bfbfac32da43bfc8425be1c4303548e20527 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 4 Jun 2015 18:55:18 +0200 Subject: x86/mm/pat: Add pgprot_writethrough() Add pgprot_writethrough() for setting page protection flags to Write-Through mode. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: hch@lst.de Cc: hmh@hmh.eng.br Cc: jgross@suse.com Cc: konrad.wilk@oracle.com Cc: linux-mm Cc: linux-nvdimm@lists.01.org Cc: stefan.bader@canonical.com Cc: yigal@plexistor.com Link: http://lkml.kernel.org/r/1433436928-31903-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_types.h | 3 +++ arch/x86/mm/pat.c | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 78f0c8cbe316..13f310bfc09a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -367,6 +367,9 @@ extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); +#define pgprot_writethrough pgprot_writethrough +extern pgprot_t pgprot_writethrough(pgprot_t prot); + /* Indicate that x86 has its own track and untrack pfn vma functions */ #define __HAVE_PFNMAP_TRACKING diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 6311193fd1a2..076187c76d7a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -1000,6 +1000,13 @@ pgprot_t pgprot_writecombine(pgprot_t prot) } EXPORT_SYMBOL_GPL(pgprot_writecombine); +pgprot_t pgprot_writethrough(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); +} +EXPORT_SYMBOL_GPL(pgprot_writethrough); + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) static struct memtype *memtype_get_idx(loff_t pos) -- cgit v1.2.1 From 35a5a10411d87e24b46a7a9dda8d08ef9961b783 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 4 Jun 2015 18:55:19 +0200 Subject: x86/mm/pat: Extend set_page_memtype() to support Write-Through type As set_memory_wb() calls free_ram_pages_type(), which then calls set_page_memtype() with -1, _PGMT_DEFAULT is used for tracking the WB type. _PGMT_WB is defined but unused. Thus, rename _PGMT_DEFAULT to _PGMT_WB to clarify the usage, and release the slot used by _PGMT_WB. Furthermore, change free_ram_pages_type() to call set_page_memtype() with _PGMT_WB, and get_page_memtype() to return _PAGE_CACHE_MODE_WB for _PGMT_WB. Then, define _PGMT_WT in the freed slot. This allows set_page_memtype() to track the WT type. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: hch@lst.de Cc: hmh@hmh.eng.br Cc: jgross@suse.com Cc: konrad.wilk@oracle.com Cc: linux-mm Cc: linux-nvdimm@lists.01.org Cc: stefan.bader@canonical.com Cc: yigal@plexistor.com Link: http://lkml.kernel.org/r/1433436928-31903-12-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 59 +++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 076187c76d7a..188e3e07eeeb 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -70,18 +70,22 @@ __setup("debugpat", pat_debug_setup); #ifdef CONFIG_X86_PAT /* - * X86 PAT uses page flags WC and Uncached together to keep track of - * memory type of pages that have backing page struct. X86 PAT supports 3 - * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and - * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not - * been changed from its default (value of -1 used to denote this). - * Note we do not support _PAGE_CACHE_MODE_UC here. + * X86 PAT uses page flags arch_1 and uncached together to keep track of + * memory type of pages that have backing page struct. + * + * X86 PAT supports 4 different memory types: + * - _PAGE_CACHE_MODE_WB + * - _PAGE_CACHE_MODE_WC + * - _PAGE_CACHE_MODE_UC_MINUS + * - _PAGE_CACHE_MODE_WT + * + * _PAGE_CACHE_MODE_WB is the default type. */ -#define _PGMT_DEFAULT 0 +#define _PGMT_WB 0 #define _PGMT_WC (1UL << PG_arch_1) #define _PGMT_UC_MINUS (1UL << PG_uncached) -#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) #define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) #define _PGMT_CLEAR_MASK (~_PGMT_MASK) @@ -89,14 +93,14 @@ static inline enum page_cache_mode get_page_memtype(struct page *pg) { unsigned long pg_flags = pg->flags & _PGMT_MASK; - if (pg_flags == _PGMT_DEFAULT) - return -1; + if (pg_flags == _PGMT_WB) + return _PAGE_CACHE_MODE_WB; else if (pg_flags == _PGMT_WC) return _PAGE_CACHE_MODE_WC; else if (pg_flags == _PGMT_UC_MINUS) return _PAGE_CACHE_MODE_UC_MINUS; else - return _PAGE_CACHE_MODE_WB; + return _PAGE_CACHE_MODE_WT; } static inline void set_page_memtype(struct page *pg, @@ -113,11 +117,12 @@ static inline void set_page_memtype(struct page *pg, case _PAGE_CACHE_MODE_UC_MINUS: memtype_flags = _PGMT_UC_MINUS; break; - case _PAGE_CACHE_MODE_WB: - memtype_flags = _PGMT_WB; + case _PAGE_CACHE_MODE_WT: + memtype_flags = _PGMT_WT; break; + case _PAGE_CACHE_MODE_WB: default: - memtype_flags = _PGMT_DEFAULT; + memtype_flags = _PGMT_WB; break; } @@ -401,8 +406,10 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) /* * For RAM pages, we use page flags to mark the pages with appropriate type. - * The page flags are limited to three types, WB, WC and UC-. WT and WP requests - * fail with -EINVAL, and UC gets redirected to UC-. + * The page flags are limited to four types, WB (default), WC, WT and UC-. + * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting + * a new memory type is only allowed for a page mapped with the default WB + * type. * * Here we do two passes: * - Find the memtype of all the pages in the range, look for any conflicts. @@ -415,8 +422,7 @@ static int reserve_ram_pages_type(u64 start, u64 end, struct page *page; u64 pfn; - if ((req_type == _PAGE_CACHE_MODE_WT) || - (req_type == _PAGE_CACHE_MODE_WP)) { + if (req_type == _PAGE_CACHE_MODE_WP) { if (new_type) *new_type = _PAGE_CACHE_MODE_UC_MINUS; return -EINVAL; @@ -433,7 +439,7 @@ static int reserve_ram_pages_type(u64 start, u64 end, page = pfn_to_page(pfn); type = get_page_memtype(page); - if (type != -1) { + if (type != _PAGE_CACHE_MODE_WB) { pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", start, end - 1, type, req_type); if (new_type) @@ -460,7 +466,7 @@ static int free_ram_pages_type(u64 start, u64 end) for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - set_page_memtype(page, -1); + set_page_memtype(page, _PAGE_CACHE_MODE_WB); } return 0; } @@ -601,7 +607,7 @@ int free_memtype(u64 start, u64 end) * Only to be called when PAT is enabled * * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS - * or _PAGE_CACHE_MODE_UC + * or _PAGE_CACHE_MODE_WT. */ static enum page_cache_mode lookup_memtype(u64 paddr) { @@ -613,16 +619,9 @@ static enum page_cache_mode lookup_memtype(u64 paddr) if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { struct page *page; - page = pfn_to_page(paddr >> PAGE_SHIFT); - rettype = get_page_memtype(page); - /* - * -1 from get_page_memtype() implies RAM page is in its - * default state and not reserved, and hence of type WB - */ - if (rettype == -1) - rettype = _PAGE_CACHE_MODE_WB; - return rettype; + page = pfn_to_page(paddr >> PAGE_SHIFT); + return get_page_memtype(page); } spin_lock(&memtype_lock); -- cgit v1.2.1 From 623dffb2a2e059e1ace45b59b3ff21c66c419614 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 4 Jun 2015 18:55:20 +0200 Subject: x86/mm/pat: Add set_memory_wt() for Write-Through type Now that reserve_ram_pages_type() accepts the WT type, add set_memory_wt(), set_memory_array_wt() and set_pages_array_wt() in order to be able to set memory to Write-Through page cache mode. Also, extend ioremap_change_attr() to accept the WT type. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: hch@lst.de Cc: hmh@hmh.eng.br Cc: jgross@suse.com Cc: konrad.wilk@oracle.com Cc: linux-mm Cc: linux-nvdimm@lists.01.org Cc: stefan.bader@canonical.com Cc: yigal@plexistor.com Link: http://lkml.kernel.org/r/1433436928-31903-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cacheflush.h | 6 +++- arch/x86/mm/ioremap.c | 3 ++ arch/x86/mm/pageattr.c | 62 +++++++++++++++++++++++++++++++-------- 3 files changed, 57 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 47c8e32f621a..b6f7457d12e4 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -8,7 +8,7 @@ /* * The set_memory_* API can be used to change various attributes of a virtual * address range. The attributes include: - * Cachability : UnCached, WriteCombining, WriteBack + * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack * Executability : eXeutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent @@ -35,9 +35,11 @@ int _set_memory_uc(unsigned long addr, int numpages); int _set_memory_wc(unsigned long addr, int numpages); +int _set_memory_wt(unsigned long addr, int numpages); int _set_memory_wb(unsigned long addr, int numpages); int set_memory_uc(unsigned long addr, int numpages); int set_memory_wc(unsigned long addr, int numpages); +int set_memory_wt(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); @@ -48,10 +50,12 @@ int set_memory_4k(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); +int set_memory_array_wt(unsigned long *addr, int addrinarray); int set_memory_array_wb(unsigned long *addr, int addrinarray); int set_pages_array_uc(struct page **pages, int addrinarray); int set_pages_array_wc(struct page **pages, int addrinarray); +int set_pages_array_wt(struct page **pages, int addrinarray); int set_pages_array_wb(struct page **pages, int addrinarray); /* diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 07cd46a8f30a..8405c0c6a535 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -42,6 +42,9 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, case _PAGE_CACHE_MODE_WC: err = _set_memory_wc(vaddr, nrpages); break; + case _PAGE_CACHE_MODE_WT: + err = _set_memory_wt(vaddr, nrpages); + break; case _PAGE_CACHE_MODE_WB: err = _set_memory_wb(vaddr, nrpages); break; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 31b4f3fd1207..727158cb3b3c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1503,12 +1503,10 @@ EXPORT_SYMBOL(set_memory_uc); static int _set_memory_array(unsigned long *addr, int addrinarray, enum page_cache_mode new_type) { + enum page_cache_mode set_type; int i, j; int ret; - /* - * for now UC MINUS. see comments in ioremap_nocache() - */ for (i = 0; i < addrinarray; i++) { ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, new_type, NULL); @@ -1516,9 +1514,12 @@ static int _set_memory_array(unsigned long *addr, int addrinarray, goto out_free; } + /* If WC, set to UC- first and then WC */ + set_type = (new_type == _PAGE_CACHE_MODE_WC) ? + _PAGE_CACHE_MODE_UC_MINUS : new_type; + ret = change_page_attr_set(addr, addrinarray, - cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), - 1); + cachemode2pgprot(set_type), 1); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(addr, addrinarray, @@ -1550,6 +1551,12 @@ int set_memory_array_wc(unsigned long *addr, int addrinarray) } EXPORT_SYMBOL(set_memory_array_wc); +int set_memory_array_wt(unsigned long *addr, int addrinarray) +{ + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT); +} +EXPORT_SYMBOL_GPL(set_memory_array_wt); + int _set_memory_wc(unsigned long addr, int numpages) { int ret; @@ -1575,21 +1582,39 @@ int set_memory_wc(unsigned long addr, int numpages) ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_WC, NULL); if (ret) - goto out_err; + return ret; ret = _set_memory_wc(addr, numpages); if (ret) - goto out_free; + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); - return 0; - -out_free: - free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); -out_err: return ret; } EXPORT_SYMBOL(set_memory_wc); +int _set_memory_wt(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); +} + +int set_memory_wt(unsigned long addr, int numpages) +{ + int ret; + + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_MODE_WT, NULL); + if (ret) + return ret; + + ret = _set_memory_wt(addr, numpages); + if (ret) + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + + return ret; +} +EXPORT_SYMBOL_GPL(set_memory_wt); + int _set_memory_wb(unsigned long addr, int numpages) { /* WB cache mode is hard wired to all cache attribute bits being 0 */ @@ -1680,6 +1705,7 @@ static int _set_pages_array(struct page **pages, int addrinarray, { unsigned long start; unsigned long end; + enum page_cache_mode set_type; int i; int free_idx; int ret; @@ -1693,8 +1719,12 @@ static int _set_pages_array(struct page **pages, int addrinarray, goto err_out; } + /* If WC, set to UC- first and then WC */ + set_type = (new_type == _PAGE_CACHE_MODE_WC) ? + _PAGE_CACHE_MODE_UC_MINUS : new_type; + ret = cpa_set_pages_array(pages, addrinarray, - cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS)); + cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, addrinarray, cachemode2pgprot( @@ -1728,6 +1758,12 @@ int set_pages_array_wc(struct page **pages, int addrinarray) } EXPORT_SYMBOL(set_pages_array_wc); +int set_pages_array_wt(struct page **pages, int addrinarray) +{ + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT); +} +EXPORT_SYMBOL_GPL(set_pages_array_wt); + int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); -- cgit v1.2.1 From bc12edb8739be10fae9f86691a3708d36d00b4f8 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 4 Jun 2015 18:55:22 +0200 Subject: x86/mce: Add Local MCE definitions Add required definitions to support Local Machine Check Exceptions. Historically, machine check exceptions on Intel x86 processors have been broadcast to all logical processors in the system. Upcoming CPUs will support an opt-in mechanism to request some machine check exceptions be delivered to a single logical processor experiencing the fault. See http://www.intel.com/sdm Volume 3, System Programming Guide, chapter 15 for more information on MSRs and documentation on Local MCE. Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1433436928-31903-15-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 5 +++++ arch/x86/include/uapi/asm/msr-index.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6a3034a0a072..ae2bfb895994 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -17,11 +17,16 @@ #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) #define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ #define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */ +#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */ /* MCG_STATUS register defines */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ +#define MCG_STATUS_LMCES (1ULL<<3) /* LMCE signaled */ + +/* MCG_EXT_CTL register defines */ +#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */ /* MCi_STATUS register defines */ #define MCI_STATUS_VAL (1ULL<<63) /* valid error */ diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c469490db4a8..32c69d5fbeac 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -56,6 +56,7 @@ #define MSR_IA32_MCG_CAP 0x00000179 #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_IA32_MCG_EXT_CTL 0x000004d0 #define MSR_OFFCORE_RSP_0 0x000001a6 #define MSR_OFFCORE_RSP_1 0x000001a7 @@ -379,6 +380,7 @@ #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) +#define FEATURE_CONTROL_LMCE (1<<20) #define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE_BSP (1<<8) -- cgit v1.2.1 From 88d538672ea26223bca08225bc49f4e65e71683d Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 4 Jun 2015 18:55:23 +0200 Subject: x86/mce: Add infrastructure to support Local MCE Initialize and prepare for handling LMCEs. Add a boot-time option to disable LMCEs. Signed-off-by: Ashok Raj [ Simplify stuff, align statements for better readability, reflow comments; kill unused lmce_clear(); save us an MSR write if LMCE is already enabled. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1433436928-31903-16-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 5 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 3 +++ arch/x86/kernel/cpu/mcheck/mce_intel.c | 43 ++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ae2bfb895994..982dfc3679ad 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -109,6 +109,7 @@ struct mce_log { struct mca_config { bool dont_log_ce; bool cmci_disabled; + bool lmce_disabled; bool ignore_ce; bool disabled; bool ser; @@ -184,12 +185,16 @@ void cmci_clear(void); void cmci_reenable(void); void cmci_rediscover(void); void cmci_recheck(void); +void lmce_clear(void); +void lmce_enable(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } static inline void cmci_clear(void) {} static inline void cmci_reenable(void) {} static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} +static inline void lmce_clear(void) {} +static inline void lmce_enable(void) {} #endif #ifdef CONFIG_X86_MCE_AMD diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0cbcd3183acf..c8c6577b4ada 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1982,6 +1982,7 @@ void mce_disable_bank(int bank) /* * mce=off Disables machine check * mce=no_cmci Disables CMCI + * mce=no_lmce Disables LMCE * mce=dont_log_ce Clears corrected events silently, no log created for CEs. * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) @@ -2005,6 +2006,8 @@ static int __init mcheck_enable(char *str) cfg->disabled = true; else if (!strcmp(str, "no_cmci")) cfg->cmci_disabled = true; + else if (!strcmp(str, "no_lmce")) + cfg->lmce_disabled = true; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index b4a41cf030ed..2d872deb2c50 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -91,6 +91,36 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +static bool lmce_supported(void) +{ + u64 tmp; + + if (mca_cfg.lmce_disabled) + return false; + + rdmsrl(MSR_IA32_MCG_CAP, tmp); + + /* + * LMCE depends on recovery support in the processor. Hence both + * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP. + */ + if ((tmp & (MCG_SER_P | MCG_LMCE_P)) != + (MCG_SER_P | MCG_LMCE_P)) + return false; + + /* + * BIOS should indicate support for LMCE by setting bit 20 in + * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will + * generate a #GP fault. + */ + rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp); + if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) == + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) + return true; + + return false; +} + bool mce_intel_cmci_poll(void) { if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) @@ -405,6 +435,19 @@ static void intel_init_cmci(void) cmci_recheck(); } +void intel_init_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + + if (!(val & MCG_EXT_CTL_LMCE_EN)) + wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); -- cgit v1.2.1 From 243d657eaf540db882f73497060da5a4f7d86a90 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 4 Jun 2015 18:55:24 +0200 Subject: x86/mce: Handle Local MCE events Add the necessary changes to do_machine_check() to be able to process MCEs signaled as local MCEs. Typically, only recoverable errors (SRAR type) will be Signaled as LMCE. The architecture does not restrict to only those errors, however. When errors are signaled as LMCE, there is no need for the MCE handler to perform rendezvous with other logical processors unlike earlier processors that would broadcast machine check errors. Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1433436928-31903-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++++++++++------ arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + 2 files changed, 27 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c8c6577b4ada..ddc46d67d93e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1047,6 +1047,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) char *msg = "Unknown"; u64 recover_paddr = ~0ull; int flags = MF_ACTION_REQUIRED; + int lmce = 0; prev_state = ist_enter(regs); @@ -1074,11 +1075,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; /* - * Go through all the banks in exclusion of the other CPUs. - * This way we don't report duplicated events on shared banks - * because the first one to see it will clear it. + * Check if this MCE is signaled to only this logical processor */ - order = mce_start(&no_way_out); + if (m.mcgstatus & MCG_STATUS_LMCES) + lmce = 1; + else { + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. + * If this is a Local MCE, then no need to perform rendezvous. + */ + order = mce_start(&no_way_out); + } + for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); if (!test_bit(i, valid_banks)) @@ -1155,8 +1165,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Do most of the synchronization with other CPUs. * When there's any problem use only local no_way_out state. */ - if (mce_end(order) < 0) - no_way_out = worst >= MCE_PANIC_SEVERITY; + if (!lmce) { + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } else { + /* + * Local MCE skipped calling mce_reign() + * If we found a fatal error, we need to panic here. + */ + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + mce_panic("Machine check from unknown source", + NULL, NULL); + } /* * At insane "tolerant" levels we take no action. Otherwise diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2d872deb2c50..844f56c5616d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -452,4 +452,5 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); + intel_init_lmce(); } -- cgit v1.2.1 From c8e56d20f2d190d54c0615775dcb6a23c1091681 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Jun 2015 18:55:25 +0200 Subject: x86: Kill CONFIG_X86_HT In talking to Aravind recently about making certain AMD topology attributes available to the MCE injection module, it seemed like that CONFIG_X86_HT thing is more or less superfluous. It is def_bool y, depends on SMP and gets enabled in the majority of .configs - distro and otherwise - out there. So let's kill it and make code behind it depend directly on SMP. Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Bartosz Golaszewski Cc: Catalin Marinas Cc: Daniel Walter Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Igor Mammedov Cc: Jacob Shin Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433436928-31903-18-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 8 ++------ arch/x86/include/asm/topology.h | 2 +- arch/x86/kernel/cpu/amd.c | 6 +++--- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++---- 5 files changed, 12 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8a5cca39e74f..7e39f9b22705 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -261,10 +261,6 @@ config X86_64_SMP def_bool y depends on X86_64 && SMP -config X86_HT - def_bool y - depends on SMP - config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR @@ -865,7 +861,7 @@ config NR_CPUS config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" - depends on X86_HT + depends on SMP ---help--- SMT scheduler support improves the CPU scheduler's decision making when dealing with Intel Pentium 4 chips with HyperThreading at a @@ -875,7 +871,7 @@ config SCHED_SMT config SCHED_MC def_bool y prompt "Multi-core scheduler support" - depends on X86_HT + depends on SMP ---help--- Multi-core scheduler support improves the CPU scheduler's decision making when dealing with multi-core CPU chips at a cost of slightly diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 0e8f04f2c26f..8d717faeed22 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -26,7 +26,7 @@ #define _ASM_X86_TOPOLOGY_H #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP # define ENABLE_TOPO_DEFINES # endif #else diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e4cf63301ff4..eb4f01269b5d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -288,7 +288,7 @@ static int nearby_node(int apicid) * Assumption: Number of cores in each internal node is the same. * (2) AMD processors supporting compute units */ -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { u32 nodes, cores_per_cu = 1; @@ -341,7 +341,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) */ static void amd_detect_cmp(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); @@ -420,7 +420,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) static void early_init_amd_mc(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits, ecx; /* Multi core CPU? */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6bec0b55863e..b6fe2e47f7f1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -508,7 +508,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) void detect_ht(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; int index_msb, core_bits; static bool printed; @@ -844,7 +844,7 @@ static void generic_identify(struct cpuinfo_x86 *c) if (c->cpuid_level >= 0x00000001) { c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); # else c->apicid = c->initial_apicid; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index edcb0e28c336..be4febc58b94 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned int cpu = c->cpu_index; #endif @@ -773,19 +773,19 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l2_id; #endif } if (new_l3) { l3 = new_l3; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l3_id; #endif } -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP /* * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in * turns means that the only possibility is SMT (as indicated in -- cgit v1.2.1 From b72e7464e4cf80117938e6adb8c22fdc1ca46d42 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Jun 2015 18:55:26 +0200 Subject: x86/uapi: Do not export as part of the user API headers This header containing all MSRs and respective bit definitions got exported to userspace in conjunction with the big UAPI shuffle. But, it doesn't belong in the UAPI headers because userspace can do its own MSR defines and exporting them from the kernel blocks us from doing cleanups/renames in that header. Which is ridiculous - it is not kernel's job to export such a header and keep MSRs list and their names stable. Signed-off-by: Borislav Petkov Acked-by: H. Peter Anvin Cc: Andrew Morton Cc: Andy Lutomirski Cc: David Howells Cc: Linus Torvalds Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433436928-31903-19-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 665 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/msr.h | 3 +- arch/x86/include/uapi/asm/msr-index.h | 665 ---------------------------------- arch/x86/include/uapi/asm/msr.h | 2 - 4 files changed, 667 insertions(+), 668 deletions(-) create mode 100644 arch/x86/include/asm/msr-index.h delete mode 100644 arch/x86/include/uapi/asm/msr-index.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h new file mode 100644 index 000000000000..9ebc3d009373 --- /dev/null +++ b/arch/x86/include/asm/msr-index.h @@ -0,0 +1,665 @@ +#ifndef _ASM_X86_MSR_INDEX_H +#define _ASM_X86_MSR_INDEX_H + +/* CPU model specific register (MSR) numbers */ + +/* x86-64 specific MSRs */ +#define MSR_EFER 0xc0000080 /* extended feature register */ +#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */ +#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */ +#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */ +#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */ +#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ +#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ +#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ +#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */ + +/* EFER bits: */ +#define _EFER_SCE 0 /* SYSCALL/SYSRET */ +#define _EFER_LME 8 /* Long mode enable */ +#define _EFER_LMA 10 /* Long mode active (read-only) */ +#define _EFER_NX 11 /* No execute enable */ +#define _EFER_SVME 12 /* Enable virtualization */ +#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ +#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ + +#define EFER_SCE (1<<_EFER_SCE) +#define EFER_LME (1<<_EFER_LME) +#define EFER_LMA (1<<_EFER_LMA) +#define EFER_NX (1<<_EFER_NX) +#define EFER_SVME (1<<_EFER_SVME) +#define EFER_LMSLE (1<<_EFER_LMSLE) +#define EFER_FFXSR (1<<_EFER_FFXSR) + +/* Intel MSRs. Some also available on other CPUs */ +#define MSR_IA32_PERFCTR0 0x000000c1 +#define MSR_IA32_PERFCTR1 0x000000c2 +#define MSR_FSB_FREQ 0x000000cd +#define MSR_NHM_PLATFORM_INFO 0x000000ce + +#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 +#define NHM_C3_AUTO_DEMOTE (1UL << 25) +#define NHM_C1_AUTO_DEMOTE (1UL << 26) +#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25) +#define SNB_C1_AUTO_UNDEMOTE (1UL << 27) +#define SNB_C3_AUTO_UNDEMOTE (1UL << 28) + +#define MSR_PLATFORM_INFO 0x000000ce +#define MSR_MTRRcap 0x000000fe +#define MSR_IA32_BBL_CR_CTL 0x00000119 +#define MSR_IA32_BBL_CR_CTL3 0x0000011e + +#define MSR_IA32_SYSENTER_CS 0x00000174 +#define MSR_IA32_SYSENTER_ESP 0x00000175 +#define MSR_IA32_SYSENTER_EIP 0x00000176 + +#define MSR_IA32_MCG_CAP 0x00000179 +#define MSR_IA32_MCG_STATUS 0x0000017a +#define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_IA32_MCG_EXT_CTL 0x000004d0 + +#define MSR_OFFCORE_RSP_0 0x000001a6 +#define MSR_OFFCORE_RSP_1 0x000001a7 +#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad +#define MSR_IVT_TURBO_RATIO_LIMIT 0x000001ae +#define MSR_TURBO_RATIO_LIMIT 0x000001ad +#define MSR_TURBO_RATIO_LIMIT1 0x000001ae +#define MSR_TURBO_RATIO_LIMIT2 0x000001af + +#define MSR_LBR_SELECT 0x000001c8 +#define MSR_LBR_TOS 0x000001c9 +#define MSR_LBR_NHM_FROM 0x00000680 +#define MSR_LBR_NHM_TO 0x000006c0 +#define MSR_LBR_CORE_FROM 0x00000040 +#define MSR_LBR_CORE_TO 0x00000060 + +#define MSR_IA32_PEBS_ENABLE 0x000003f1 +#define MSR_IA32_DS_AREA 0x00000600 +#define MSR_IA32_PERF_CAPABILITIES 0x00000345 +#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 + +#define MSR_IA32_RTIT_CTL 0x00000570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_OS BIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_TSC_EN BIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define MSR_IA32_RTIT_STATUS 0x00000571 +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPED BIT(5) +#define MSR_IA32_RTIT_CR3_MATCH 0x00000572 +#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560 +#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561 + +#define MSR_MTRRfix64K_00000 0x00000250 +#define MSR_MTRRfix16K_80000 0x00000258 +#define MSR_MTRRfix16K_A0000 0x00000259 +#define MSR_MTRRfix4K_C0000 0x00000268 +#define MSR_MTRRfix4K_C8000 0x00000269 +#define MSR_MTRRfix4K_D0000 0x0000026a +#define MSR_MTRRfix4K_D8000 0x0000026b +#define MSR_MTRRfix4K_E0000 0x0000026c +#define MSR_MTRRfix4K_E8000 0x0000026d +#define MSR_MTRRfix4K_F0000 0x0000026e +#define MSR_MTRRfix4K_F8000 0x0000026f +#define MSR_MTRRdefType 0x000002ff + +#define MSR_IA32_CR_PAT 0x00000277 + +#define MSR_IA32_DEBUGCTLMSR 0x000001d9 +#define MSR_IA32_LASTBRANCHFROMIP 0x000001db +#define MSR_IA32_LASTBRANCHTOIP 0x000001dc +#define MSR_IA32_LASTINTFROMIP 0x000001dd +#define MSR_IA32_LASTINTTOIP 0x000001de + +/* DEBUGCTLMSR bits (others vary by model): */ +#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ +#define DEBUGCTLMSR_TR (1UL << 6) +#define DEBUGCTLMSR_BTS (1UL << 7) +#define DEBUGCTLMSR_BTINT (1UL << 8) +#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9) +#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) +#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) + +#define MSR_IA32_POWER_CTL 0x000001fc + +#define MSR_IA32_MC0_CTL 0x00000400 +#define MSR_IA32_MC0_STATUS 0x00000401 +#define MSR_IA32_MC0_ADDR 0x00000402 +#define MSR_IA32_MC0_MISC 0x00000403 + +/* C-state Residency Counters */ +#define MSR_PKG_C3_RESIDENCY 0x000003f8 +#define MSR_PKG_C6_RESIDENCY 0x000003f9 +#define MSR_PKG_C7_RESIDENCY 0x000003fa +#define MSR_CORE_C3_RESIDENCY 0x000003fc +#define MSR_CORE_C6_RESIDENCY 0x000003fd +#define MSR_CORE_C7_RESIDENCY 0x000003fe +#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff +#define MSR_PKG_C2_RESIDENCY 0x0000060d +#define MSR_PKG_C8_RESIDENCY 0x00000630 +#define MSR_PKG_C9_RESIDENCY 0x00000631 +#define MSR_PKG_C10_RESIDENCY 0x00000632 + +/* Run Time Average Power Limiting (RAPL) Interface */ + +#define MSR_RAPL_POWER_UNIT 0x00000606 + +#define MSR_PKG_POWER_LIMIT 0x00000610 +#define MSR_PKG_ENERGY_STATUS 0x00000611 +#define MSR_PKG_PERF_STATUS 0x00000613 +#define MSR_PKG_POWER_INFO 0x00000614 + +#define MSR_DRAM_POWER_LIMIT 0x00000618 +#define MSR_DRAM_ENERGY_STATUS 0x00000619 +#define MSR_DRAM_PERF_STATUS 0x0000061b +#define MSR_DRAM_POWER_INFO 0x0000061c + +#define MSR_PP0_POWER_LIMIT 0x00000638 +#define MSR_PP0_ENERGY_STATUS 0x00000639 +#define MSR_PP0_POLICY 0x0000063a +#define MSR_PP0_PERF_STATUS 0x0000063b + +#define MSR_PP1_POWER_LIMIT 0x00000640 +#define MSR_PP1_ENERGY_STATUS 0x00000641 +#define MSR_PP1_POLICY 0x00000642 + +#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 +#define MSR_PKG_ANY_CORE_C0_RES 0x00000659 +#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A +#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B + +#define MSR_CORE_C1_RES 0x00000660 + +#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 +#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669 + +#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690 +#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 +#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 + +/* Hardware P state interface */ +#define MSR_PPERF 0x0000064e +#define MSR_PERF_LIMIT_REASONS 0x0000064f +#define MSR_PM_ENABLE 0x00000770 +#define MSR_HWP_CAPABILITIES 0x00000771 +#define MSR_HWP_REQUEST_PKG 0x00000772 +#define MSR_HWP_INTERRUPT 0x00000773 +#define MSR_HWP_REQUEST 0x00000774 +#define MSR_HWP_STATUS 0x00000777 + +/* CPUID.6.EAX */ +#define HWP_BASE_BIT (1<<7) +#define HWP_NOTIFICATIONS_BIT (1<<8) +#define HWP_ACTIVITY_WINDOW_BIT (1<<9) +#define HWP_ENERGY_PERF_PREFERENCE_BIT (1<<10) +#define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11) + +/* IA32_HWP_CAPABILITIES */ +#define HWP_HIGHEST_PERF(x) (x & 0xff) +#define HWP_GUARANTEED_PERF(x) ((x & (0xff << 8)) >>8) +#define HWP_MOSTEFFICIENT_PERF(x) ((x & (0xff << 16)) >>16) +#define HWP_LOWEST_PERF(x) ((x & (0xff << 24)) >>24) + +/* IA32_HWP_REQUEST */ +#define HWP_MIN_PERF(x) (x & 0xff) +#define HWP_MAX_PERF(x) ((x & 0xff) << 8) +#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) +#define HWP_ENERGY_PERF_PREFERENCE(x) ((x & 0xff) << 24) +#define HWP_ACTIVITY_WINDOW(x) ((x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((x & 0x1) << 42) + +/* IA32_HWP_STATUS */ +#define HWP_GUARANTEED_CHANGE(x) (x & 0x1) +#define HWP_EXCURSION_TO_MINIMUM(x) (x & 0x4) + +/* IA32_HWP_INTERRUPT */ +#define HWP_CHANGE_TO_GUARANTEED_INT(x) (x & 0x1) +#define HWP_EXCURSION_TO_MINIMUM_INT(x) (x & 0x2) + +#define MSR_AMD64_MC0_MASK 0xc0010044 + +#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) +#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) +#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) +#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) + +#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) + +/* These are consecutive and not in the normal 4er MCE bank block */ +#define MSR_IA32_MC0_CTL2 0x00000280 +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) + +#define MSR_P6_PERFCTR0 0x000000c1 +#define MSR_P6_PERFCTR1 0x000000c2 +#define MSR_P6_EVNTSEL0 0x00000186 +#define MSR_P6_EVNTSEL1 0x00000187 + +#define MSR_KNC_PERFCTR0 0x00000020 +#define MSR_KNC_PERFCTR1 0x00000021 +#define MSR_KNC_EVNTSEL0 0x00000028 +#define MSR_KNC_EVNTSEL1 0x00000029 + +/* Alternative perfctr range with full access. */ +#define MSR_IA32_PMC0 0x000004c1 + +/* AMD64 MSRs. Not complete. See the architecture manual for a more + complete list. */ + +#define MSR_AMD64_PATCH_LEVEL 0x0000008b +#define MSR_AMD64_TSC_RATIO 0xc0000104 +#define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 +#define MSR_AMD64_OSVW_STATUS 0xc0010141 +#define MSR_AMD64_LS_CFG 0xc0011020 +#define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_BU_CFG2 0xc001102a +#define MSR_AMD64_IBSFETCHCTL 0xc0011030 +#define MSR_AMD64_IBSFETCHLINAD 0xc0011031 +#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 +#define MSR_AMD64_IBSFETCH_REG_COUNT 3 +#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL< +#include "msr-index.h" #ifndef __ASSEMBLY__ #include #include #include +#include struct msr { union { diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h deleted file mode 100644 index 9ebc3d009373..000000000000 --- a/arch/x86/include/uapi/asm/msr-index.h +++ /dev/null @@ -1,665 +0,0 @@ -#ifndef _ASM_X86_MSR_INDEX_H -#define _ASM_X86_MSR_INDEX_H - -/* CPU model specific register (MSR) numbers */ - -/* x86-64 specific MSRs */ -#define MSR_EFER 0xc0000080 /* extended feature register */ -#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */ -#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */ -#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */ -#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */ -#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ -#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ -#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ -#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */ - -/* EFER bits: */ -#define _EFER_SCE 0 /* SYSCALL/SYSRET */ -#define _EFER_LME 8 /* Long mode enable */ -#define _EFER_LMA 10 /* Long mode active (read-only) */ -#define _EFER_NX 11 /* No execute enable */ -#define _EFER_SVME 12 /* Enable virtualization */ -#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ -#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ - -#define EFER_SCE (1<<_EFER_SCE) -#define EFER_LME (1<<_EFER_LME) -#define EFER_LMA (1<<_EFER_LMA) -#define EFER_NX (1<<_EFER_NX) -#define EFER_SVME (1<<_EFER_SVME) -#define EFER_LMSLE (1<<_EFER_LMSLE) -#define EFER_FFXSR (1<<_EFER_FFXSR) - -/* Intel MSRs. Some also available on other CPUs */ -#define MSR_IA32_PERFCTR0 0x000000c1 -#define MSR_IA32_PERFCTR1 0x000000c2 -#define MSR_FSB_FREQ 0x000000cd -#define MSR_NHM_PLATFORM_INFO 0x000000ce - -#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 -#define NHM_C3_AUTO_DEMOTE (1UL << 25) -#define NHM_C1_AUTO_DEMOTE (1UL << 26) -#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25) -#define SNB_C1_AUTO_UNDEMOTE (1UL << 27) -#define SNB_C3_AUTO_UNDEMOTE (1UL << 28) - -#define MSR_PLATFORM_INFO 0x000000ce -#define MSR_MTRRcap 0x000000fe -#define MSR_IA32_BBL_CR_CTL 0x00000119 -#define MSR_IA32_BBL_CR_CTL3 0x0000011e - -#define MSR_IA32_SYSENTER_CS 0x00000174 -#define MSR_IA32_SYSENTER_ESP 0x00000175 -#define MSR_IA32_SYSENTER_EIP 0x00000176 - -#define MSR_IA32_MCG_CAP 0x00000179 -#define MSR_IA32_MCG_STATUS 0x0000017a -#define MSR_IA32_MCG_CTL 0x0000017b -#define MSR_IA32_MCG_EXT_CTL 0x000004d0 - -#define MSR_OFFCORE_RSP_0 0x000001a6 -#define MSR_OFFCORE_RSP_1 0x000001a7 -#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad -#define MSR_IVT_TURBO_RATIO_LIMIT 0x000001ae -#define MSR_TURBO_RATIO_LIMIT 0x000001ad -#define MSR_TURBO_RATIO_LIMIT1 0x000001ae -#define MSR_TURBO_RATIO_LIMIT2 0x000001af - -#define MSR_LBR_SELECT 0x000001c8 -#define MSR_LBR_TOS 0x000001c9 -#define MSR_LBR_NHM_FROM 0x00000680 -#define MSR_LBR_NHM_TO 0x000006c0 -#define MSR_LBR_CORE_FROM 0x00000040 -#define MSR_LBR_CORE_TO 0x00000060 - -#define MSR_IA32_PEBS_ENABLE 0x000003f1 -#define MSR_IA32_DS_AREA 0x00000600 -#define MSR_IA32_PERF_CAPABILITIES 0x00000345 -#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 - -#define MSR_IA32_RTIT_CTL 0x00000570 -#define RTIT_CTL_TRACEEN BIT(0) -#define RTIT_CTL_OS BIT(2) -#define RTIT_CTL_USR BIT(3) -#define RTIT_CTL_CR3EN BIT(7) -#define RTIT_CTL_TOPA BIT(8) -#define RTIT_CTL_TSC_EN BIT(10) -#define RTIT_CTL_DISRETC BIT(11) -#define RTIT_CTL_BRANCH_EN BIT(13) -#define MSR_IA32_RTIT_STATUS 0x00000571 -#define RTIT_STATUS_CONTEXTEN BIT(1) -#define RTIT_STATUS_TRIGGEREN BIT(2) -#define RTIT_STATUS_ERROR BIT(4) -#define RTIT_STATUS_STOPPED BIT(5) -#define MSR_IA32_RTIT_CR3_MATCH 0x00000572 -#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560 -#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561 - -#define MSR_MTRRfix64K_00000 0x00000250 -#define MSR_MTRRfix16K_80000 0x00000258 -#define MSR_MTRRfix16K_A0000 0x00000259 -#define MSR_MTRRfix4K_C0000 0x00000268 -#define MSR_MTRRfix4K_C8000 0x00000269 -#define MSR_MTRRfix4K_D0000 0x0000026a -#define MSR_MTRRfix4K_D8000 0x0000026b -#define MSR_MTRRfix4K_E0000 0x0000026c -#define MSR_MTRRfix4K_E8000 0x0000026d -#define MSR_MTRRfix4K_F0000 0x0000026e -#define MSR_MTRRfix4K_F8000 0x0000026f -#define MSR_MTRRdefType 0x000002ff - -#define MSR_IA32_CR_PAT 0x00000277 - -#define MSR_IA32_DEBUGCTLMSR 0x000001d9 -#define MSR_IA32_LASTBRANCHFROMIP 0x000001db -#define MSR_IA32_LASTBRANCHTOIP 0x000001dc -#define MSR_IA32_LASTINTFROMIP 0x000001dd -#define MSR_IA32_LASTINTTOIP 0x000001de - -/* DEBUGCTLMSR bits (others vary by model): */ -#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ -#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ -#define DEBUGCTLMSR_TR (1UL << 6) -#define DEBUGCTLMSR_BTS (1UL << 7) -#define DEBUGCTLMSR_BTINT (1UL << 8) -#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9) -#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) -#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) - -#define MSR_IA32_POWER_CTL 0x000001fc - -#define MSR_IA32_MC0_CTL 0x00000400 -#define MSR_IA32_MC0_STATUS 0x00000401 -#define MSR_IA32_MC0_ADDR 0x00000402 -#define MSR_IA32_MC0_MISC 0x00000403 - -/* C-state Residency Counters */ -#define MSR_PKG_C3_RESIDENCY 0x000003f8 -#define MSR_PKG_C6_RESIDENCY 0x000003f9 -#define MSR_PKG_C7_RESIDENCY 0x000003fa -#define MSR_CORE_C3_RESIDENCY 0x000003fc -#define MSR_CORE_C6_RESIDENCY 0x000003fd -#define MSR_CORE_C7_RESIDENCY 0x000003fe -#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff -#define MSR_PKG_C2_RESIDENCY 0x0000060d -#define MSR_PKG_C8_RESIDENCY 0x00000630 -#define MSR_PKG_C9_RESIDENCY 0x00000631 -#define MSR_PKG_C10_RESIDENCY 0x00000632 - -/* Run Time Average Power Limiting (RAPL) Interface */ - -#define MSR_RAPL_POWER_UNIT 0x00000606 - -#define MSR_PKG_POWER_LIMIT 0x00000610 -#define MSR_PKG_ENERGY_STATUS 0x00000611 -#define MSR_PKG_PERF_STATUS 0x00000613 -#define MSR_PKG_POWER_INFO 0x00000614 - -#define MSR_DRAM_POWER_LIMIT 0x00000618 -#define MSR_DRAM_ENERGY_STATUS 0x00000619 -#define MSR_DRAM_PERF_STATUS 0x0000061b -#define MSR_DRAM_POWER_INFO 0x0000061c - -#define MSR_PP0_POWER_LIMIT 0x00000638 -#define MSR_PP0_ENERGY_STATUS 0x00000639 -#define MSR_PP0_POLICY 0x0000063a -#define MSR_PP0_PERF_STATUS 0x0000063b - -#define MSR_PP1_POWER_LIMIT 0x00000640 -#define MSR_PP1_ENERGY_STATUS 0x00000641 -#define MSR_PP1_POLICY 0x00000642 - -#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 -#define MSR_PKG_ANY_CORE_C0_RES 0x00000659 -#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A -#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B - -#define MSR_CORE_C1_RES 0x00000660 - -#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 -#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669 - -#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690 -#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 -#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 - -/* Hardware P state interface */ -#define MSR_PPERF 0x0000064e -#define MSR_PERF_LIMIT_REASONS 0x0000064f -#define MSR_PM_ENABLE 0x00000770 -#define MSR_HWP_CAPABILITIES 0x00000771 -#define MSR_HWP_REQUEST_PKG 0x00000772 -#define MSR_HWP_INTERRUPT 0x00000773 -#define MSR_HWP_REQUEST 0x00000774 -#define MSR_HWP_STATUS 0x00000777 - -/* CPUID.6.EAX */ -#define HWP_BASE_BIT (1<<7) -#define HWP_NOTIFICATIONS_BIT (1<<8) -#define HWP_ACTIVITY_WINDOW_BIT (1<<9) -#define HWP_ENERGY_PERF_PREFERENCE_BIT (1<<10) -#define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11) - -/* IA32_HWP_CAPABILITIES */ -#define HWP_HIGHEST_PERF(x) (x & 0xff) -#define HWP_GUARANTEED_PERF(x) ((x & (0xff << 8)) >>8) -#define HWP_MOSTEFFICIENT_PERF(x) ((x & (0xff << 16)) >>16) -#define HWP_LOWEST_PERF(x) ((x & (0xff << 24)) >>24) - -/* IA32_HWP_REQUEST */ -#define HWP_MIN_PERF(x) (x & 0xff) -#define HWP_MAX_PERF(x) ((x & 0xff) << 8) -#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) -#define HWP_ENERGY_PERF_PREFERENCE(x) ((x & 0xff) << 24) -#define HWP_ACTIVITY_WINDOW(x) ((x & 0xff3) << 32) -#define HWP_PACKAGE_CONTROL(x) ((x & 0x1) << 42) - -/* IA32_HWP_STATUS */ -#define HWP_GUARANTEED_CHANGE(x) (x & 0x1) -#define HWP_EXCURSION_TO_MINIMUM(x) (x & 0x4) - -/* IA32_HWP_INTERRUPT */ -#define HWP_CHANGE_TO_GUARANTEED_INT(x) (x & 0x1) -#define HWP_EXCURSION_TO_MINIMUM_INT(x) (x & 0x2) - -#define MSR_AMD64_MC0_MASK 0xc0010044 - -#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) -#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) -#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) -#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) - -#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) - -/* These are consecutive and not in the normal 4er MCE bank block */ -#define MSR_IA32_MC0_CTL2 0x00000280 -#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) - -#define MSR_P6_PERFCTR0 0x000000c1 -#define MSR_P6_PERFCTR1 0x000000c2 -#define MSR_P6_EVNTSEL0 0x00000186 -#define MSR_P6_EVNTSEL1 0x00000187 - -#define MSR_KNC_PERFCTR0 0x00000020 -#define MSR_KNC_PERFCTR1 0x00000021 -#define MSR_KNC_EVNTSEL0 0x00000028 -#define MSR_KNC_EVNTSEL1 0x00000029 - -/* Alternative perfctr range with full access. */ -#define MSR_IA32_PMC0 0x000004c1 - -/* AMD64 MSRs. Not complete. See the architecture manual for a more - complete list. */ - -#define MSR_AMD64_PATCH_LEVEL 0x0000008b -#define MSR_AMD64_TSC_RATIO 0xc0000104 -#define MSR_AMD64_NB_CFG 0xc001001f -#define MSR_AMD64_PATCH_LOADER 0xc0010020 -#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 -#define MSR_AMD64_OSVW_STATUS 0xc0010141 -#define MSR_AMD64_LS_CFG 0xc0011020 -#define MSR_AMD64_DC_CFG 0xc0011022 -#define MSR_AMD64_BU_CFG2 0xc001102a -#define MSR_AMD64_IBSFETCHCTL 0xc0011030 -#define MSR_AMD64_IBSFETCHLINAD 0xc0011031 -#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 -#define MSR_AMD64_IBSFETCH_REG_COUNT 3 -#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL< - #ifndef __ASSEMBLY__ #include -- cgit v1.2.1 From ee38a90709084b1c91279cde8f783e90f85285a1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Jun 2015 18:55:27 +0200 Subject: x86/microcode: Disable builtin microcode loading on 32-bit for now Andy Shevchenko reported machine freezes when booting latest tip on 32-bit setups. Problem is, the builtin microcode handling cannot really work that early, when we haven't even enabled paging. A proper fix would involve handling that case specially as every other early 32-bit boot case in the microcode loader and would require much more involved changes for which it is too late now, more than a week before the upcoming merge window. So, disable the builtin microcode loading on 32-bit for now. Reported-and-tested-by: Andy Shevchenko Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433436928-31903-20-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd_early.c | 4 ++++ arch/x86/kernel/cpu/microcode/intel_early.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 9208a36d0f03..9243cd839829 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -230,6 +230,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) static bool __init load_builtin_amd_microcode(struct cpio_data *cp, int family) { +#ifdef CONFIG_X86_64 char fw_name[36] = "amd-ucode/microcode_amd.bin"; if (family >= 0x15) @@ -237,6 +238,9 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, int family) "amd-ucode/microcode_amd_fam%.2xh.bin", family); return get_builtin_firmware(cp, fw_name); +#else + return false; +#endif } void __init load_ucode_amd_bsp(int family) diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 10dff3f3f686..b4858d892592 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -523,6 +523,7 @@ EXPORT_SYMBOL_GPL(save_mc_for_early); static bool __init load_builtin_intel_microcode(struct cpio_data *cp) { +#ifdef CONFIG_X86_64 u32 eax = 0x00000001, ebx, ecx = 0, edx; int family, model, stepping; char name[30]; @@ -536,6 +537,9 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping); return get_builtin_firmware(cp, name); +#else + return false; +#endif } static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; -- cgit v1.2.1 From 7b179b8feba3b887be5ddd501c25d924cf44d70a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 4 Jun 2015 18:55:28 +0200 Subject: x86/microcode: Correct CPU family related variable types Change the type of variables and function prototypes to be in alignment with what the x86_*() / __x86_*() family/model functions return. Signed-off-by: Andy Shevchenko Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433436928-31903-21-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode_amd.h | 4 ++-- arch/x86/kernel/cpu/microcode/amd_early.c | 5 +++-- arch/x86/kernel/cpu/microcode/core_early.c | 3 ++- arch/x86/kernel/cpu/microcode/intel_early.c | 4 ++-- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index b8438543f340..ac6d328977a6 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -65,12 +65,12 @@ extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, s extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; #ifdef CONFIG_MICROCODE_AMD_EARLY -extern void __init load_ucode_amd_bsp(int family); +extern void __init load_ucode_amd_bsp(unsigned int family); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); void reload_ucode_amd(void); #else -static inline void __init load_ucode_amd_bsp(int family) {} +static inline void __init load_ucode_amd_bsp(unsigned int family) {} static inline void load_ucode_amd_ap(void) {} static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } void reload_ucode_amd(void) {} diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 9243cd839829..e8a215a9a345 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -228,7 +228,8 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) } } -static bool __init load_builtin_amd_microcode(struct cpio_data *cp, int family) +static bool __init load_builtin_amd_microcode(struct cpio_data *cp, + unsigned int family) { #ifdef CONFIG_X86_64 char fw_name[36] = "amd-ucode/microcode_amd.bin"; @@ -243,7 +244,7 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, int family) #endif } -void __init load_ucode_amd_bsp(int family) +void __init load_ucode_amd_bsp(unsigned int family) { struct cpio_data cp; void **data; diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index 979ca78b1561..8ebc421d6299 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -66,7 +66,8 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) void __init load_ucode_bsp(void) { - int vendor, family; + int vendor; + unsigned int family; if (check_loader_disabled_bsp()) return; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index b4858d892592..8187b7247d1c 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -524,8 +524,8 @@ EXPORT_SYMBOL_GPL(save_mc_for_early); static bool __init load_builtin_intel_microcode(struct cpio_data *cp) { #ifdef CONFIG_X86_64 - u32 eax = 0x00000001, ebx, ecx = 0, edx; - int family, model, stepping; + unsigned int eax = 0x00000001, ebx, ecx = 0, edx; + unsigned int family, model, stepping; char name[30]; native_cpuid(&eax, &ebx, &ecx, &edx); -- cgit v1.2.1 From 8cf1a3de97804b047973dd44cfacdc1930da8403 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 26 May 2015 09:10:35 -0400 Subject: perf/x86/intel/uncore: Fix CBOX bit wide and UBOX reg on Haswell-EP CBOX counters are increased to 48b on HSX. Correct the MSR address for HSWEP_U_MSR_PMON_CTR0 and HSWEP_U_MSR_PMON_CTL0. See specification in: http://www.intel.com/content/www/us/en/processors/xeon/ xeon-e5-v3-uncore-performance-monitoring.html Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1432645835-7918-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 12d9548457e7..6d6e85dd5849 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -164,8 +164,8 @@ ((1ULL << (n)) - 1))) /* Haswell-EP Ubox */ -#define HSWEP_U_MSR_PMON_CTR0 0x705 -#define HSWEP_U_MSR_PMON_CTL0 0x709 +#define HSWEP_U_MSR_PMON_CTR0 0x709 +#define HSWEP_U_MSR_PMON_CTL0 0x705 #define HSWEP_U_MSR_PMON_FILTER 0x707 #define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703 @@ -1914,7 +1914,7 @@ static struct intel_uncore_type hswep_uncore_cbox = { .name = "cbox", .num_counters = 4, .num_boxes = 18, - .perf_ctr_bits = 44, + .perf_ctr_bits = 48, .event_ctl = HSWEP_C0_MSR_PMON_CTL0, .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, -- cgit v1.2.1 From 4eaca0a887eaee04fc7a3866d0f5b51b34030dfa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Jun 2015 17:39:08 +0200 Subject: preempt: Use preempt_schedule_context() as the official tracing preemption point preempt_schedule_context() is a tracing safe preemption point but it's only used when CONFIG_CONTEXT_TRACKING=y. Other configs have tracing recursion issues since commit: b30f0e3ffedf ("sched/preempt: Optimize preemption operations on __schedule() callers") introduced function based preemp_count_*() ops. Lets make it available on all configs and give it a more appropriate name for its new position. Reported-by: Fengguang Wu Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433432349-1021-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 8 +++----- arch/x86/kernel/i386_ksyms_32.c | 4 +--- arch/x86/kernel/x8664_ksyms_64.c | 4 +--- arch/x86/lib/thunk_32.S | 4 +--- arch/x86/lib/thunk_64.S | 4 +--- 5 files changed, 7 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 8f3271842533..dca71714f860 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -99,11 +99,9 @@ static __always_inline bool should_resched(void) extern asmlinkage void ___preempt_schedule(void); # define __preempt_schedule() asm ("call ___preempt_schedule") extern asmlinkage void preempt_schedule(void); -# ifdef CONFIG_CONTEXT_TRACKING - extern asmlinkage void ___preempt_schedule_context(void); -# define __preempt_schedule_context() asm ("call ___preempt_schedule_context") - extern asmlinkage void preempt_schedule_context(void); -# endif + extern asmlinkage void ___preempt_schedule_notrace(void); +# define __preempt_schedule_notrace() asm ("call ___preempt_schedule_notrace") + extern asmlinkage void preempt_schedule_notrace(void); #endif #endif /* __ASM_PREEMPT_H */ diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 05fd74f537d6..64341aa485ae 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -40,7 +40,5 @@ EXPORT_SYMBOL(empty_zero_page); #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(___preempt_schedule); -#ifdef CONFIG_CONTEXT_TRACKING -EXPORT_SYMBOL(___preempt_schedule_context); -#endif +EXPORT_SYMBOL(___preempt_schedule_notrace); #endif diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 37d8fa4438f0..a0695be19864 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -75,7 +75,5 @@ EXPORT_SYMBOL(native_load_gs_index); #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(___preempt_schedule); -#ifdef CONFIG_CONTEXT_TRACKING -EXPORT_SYMBOL(___preempt_schedule_context); -#endif +EXPORT_SYMBOL(___preempt_schedule_notrace); #endif diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index 5eb715087b80..e407941d0488 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S @@ -38,8 +38,6 @@ #ifdef CONFIG_PREEMPT THUNK ___preempt_schedule, preempt_schedule -#ifdef CONFIG_CONTEXT_TRACKING - THUNK ___preempt_schedule_context, preempt_schedule_context -#endif + THUNK ___preempt_schedule_notrace, preempt_schedule_notrace #endif diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index f89ba4e93025..2198902329b5 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -49,9 +49,7 @@ #ifdef CONFIG_PREEMPT THUNK ___preempt_schedule, preempt_schedule -#ifdef CONFIG_CONTEXT_TRACKING - THUNK ___preempt_schedule_context, preempt_schedule_context -#endif + THUNK ___preempt_schedule_notrace, preempt_schedule_notrace #endif #if defined(CONFIG_TRACE_IRQFLAGS) \ -- cgit v1.2.1 From 7b74cfb2ecb4d56a25c89cdb561e4926db85feb1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 14 May 2015 23:09:59 +0200 Subject: perf/x86/intel: add support for PERF_SAMPLE_BRANCH_IND_JUMP This patch enables support for branch sampling filter for indirect jumps (IND_JUMP). It enables LBR IND_JMP filtering where available. There is also software filtering support. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@redhat.com Cc: dsahern@gmail.com Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/1431637800-31061-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 94e5b506caa6..201e16f6655a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -96,6 +96,7 @@ enum { X86_BR_NO_TX = 1 << 14,/* not in transaction */ X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ X86_BR_CALL_STACK = 1 << 16,/* call stack */ + X86_BR_IND_JMP = 1 << 17,/* indirect jump */ }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -113,6 +114,7 @@ enum { X86_BR_IRQ |\ X86_BR_ABORT |\ X86_BR_IND_CALL |\ + X86_BR_IND_JMP |\ X86_BR_ZERO_CALL) #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) @@ -523,6 +525,9 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) X86_BR_CALL_STACK; } + if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) + mask |= X86_BR_IND_JMP; + /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -736,7 +741,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) break; case 4: case 5: - ret = X86_BR_JMP; + ret = X86_BR_IND_JMP; break; } break; @@ -844,6 +849,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { */ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, }; static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -856,6 +862,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { | LBR_FAR, [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, }; static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -870,6 +877,7 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, }; /* core */ -- cgit v1.2.1 From 851559e35fd5ab637783ba395e55edd50f761229 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 6 May 2015 15:33:47 -0400 Subject: perf/x86/intel: Use the PEBS auto reload mechanism when possible When a fixed period is specified, this patch makes perf use the PEBS auto reload mechanism. This makes normal profiling faster, because it avoids one costly MSR write in the PMI handler. However, the reset value will be loaded by hardware assist. There is a small delay compared to the previous non-auto-reload mechanism. The delay time is arbitrary, but very small. The assist cost is 400-800 cycles, assuming common cases with everything cached. The minimum period the patch currently uses is 10000. In that extreme case it can be ~10% if cycles are used. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1430940834-8964-2-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 15 +++++++++------ arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 8 ++++++-- arch/x86/kernel/cpu/perf_event_intel_ds.c | 7 +++++++ 4 files changed, 23 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index dbe3328f8ad7..9560d0fc6fa6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1094,13 +1094,16 @@ int x86_perf_event_set_period(struct perf_event *event) per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; - /* - * The hw event starts counting from this event offset, - * mark it to be able to extra future deltas: - */ - local64_set(&hwc->prev_count, (u64)-left); + if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) || + local64_read(&hwc->prev_count) != (u64)-left) { + /* + * The hw event starts counting from this event offset, + * mark it to be able to extra future deltas: + */ + local64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + } /* * Due to erratum on certan cpu we need diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 89e6cd61e6ae..7a3f0fdd2fbd 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -75,6 +75,7 @@ struct event_constraint { #define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ #define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ struct amd_nb { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 74f19d9268bb..17628930a80e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2260,8 +2260,12 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; - if (event->attr.precise_ip && x86_pmu.pebs_aliases) - x86_pmu.pebs_aliases(event); + if (event->attr.precise_ip) { + if (!event->attr.freq) + event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; + if (x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); + } if (needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7f73b3553e2e..4802d5dec222 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -688,6 +688,7 @@ void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; @@ -697,6 +698,12 @@ void intel_pmu_pebs_enable(struct perf_event *event) cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled |= 1ULL << 63; + + /* Use auto-reload if possible to save a MSR write in the PMI */ + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { + ds->pebs_event_reset[hwc->idx] = + (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; + } } void intel_pmu_pebs_disable(struct perf_event *event) -- cgit v1.2.1 From 43cf76312faefed098c057082abac8a3d521e1dc Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 6 May 2015 15:33:48 -0400 Subject: perf/x86/intel: Introduce setup_pebs_sample_data() Move code that sets up the PEBS sample data to a separate function. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1430940834-8964-3-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 95 +++++++++++++++++-------------- 1 file changed, 52 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 4802d5dec222..a5fe561c4902 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -853,8 +853,10 @@ static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) return txn; } -static void __intel_pmu_pebs_event(struct perf_event *event, - struct pt_regs *iregs, void *__pebs) +static void setup_pebs_sample_data(struct perf_event *event, + struct pt_regs *iregs, void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) { #define PERF_X86_EVENT_PEBS_HSW_PREC \ (PERF_X86_EVENT_PEBS_ST_HSW | \ @@ -866,30 +868,25 @@ static void __intel_pmu_pebs_event(struct perf_event *event, */ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct pebs_record_hsw *pebs = __pebs; - struct perf_sample_data data; - struct pt_regs regs; u64 sample_type; int fll, fst, dsrc; int fl = event->hw.flags; - if (!intel_pmu_save_and_restart(event)) - return; - sample_type = event->attr.sample_type; dsrc = sample_type & PERF_SAMPLE_DATA_SRC; fll = fl & PERF_X86_EVENT_PEBS_LDLAT; fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); - perf_sample_data_init(&data, 0, event->hw.last_period); + perf_sample_data_init(data, 0, event->hw.last_period); - data.period = event->hw.last_period; + data->period = event->hw.last_period; /* * Use latency for weight (only avail with PEBS-LL) */ if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data.weight = pebs->lat; + data->weight = pebs->lat; /* * data.data_src encodes the data source @@ -902,7 +899,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, val = precise_datala_hsw(event, pebs->dse); else if (fst) val = precise_store_data(pebs->dse); - data.data_src.val = val; + data->data_src.val = val; } /* @@ -915,58 +912,70 @@ static void __intel_pmu_pebs_event(struct perf_event *event, * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. * A possible PERF_SAMPLE_REGS will have to transfer all regs. */ - regs = *iregs; - regs.flags = pebs->flags; - set_linear_ip(®s, pebs->ip); - regs.bp = pebs->bp; - regs.sp = pebs->sp; + *regs = *iregs; + regs->flags = pebs->flags; + set_linear_ip(regs, pebs->ip); + regs->bp = pebs->bp; + regs->sp = pebs->sp; if (sample_type & PERF_SAMPLE_REGS_INTR) { - regs.ax = pebs->ax; - regs.bx = pebs->bx; - regs.cx = pebs->cx; - regs.dx = pebs->dx; - regs.si = pebs->si; - regs.di = pebs->di; - regs.bp = pebs->bp; - regs.sp = pebs->sp; - - regs.flags = pebs->flags; + regs->ax = pebs->ax; + regs->bx = pebs->bx; + regs->cx = pebs->cx; + regs->dx = pebs->dx; + regs->si = pebs->si; + regs->di = pebs->di; + regs->bp = pebs->bp; + regs->sp = pebs->sp; + + regs->flags = pebs->flags; #ifndef CONFIG_X86_32 - regs.r8 = pebs->r8; - regs.r9 = pebs->r9; - regs.r10 = pebs->r10; - regs.r11 = pebs->r11; - regs.r12 = pebs->r12; - regs.r13 = pebs->r13; - regs.r14 = pebs->r14; - regs.r15 = pebs->r15; + regs->r8 = pebs->r8; + regs->r9 = pebs->r9; + regs->r10 = pebs->r10; + regs->r11 = pebs->r11; + regs->r12 = pebs->r12; + regs->r13 = pebs->r13; + regs->r14 = pebs->r14; + regs->r15 = pebs->r15; #endif } if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { - regs.ip = pebs->real_ip; - regs.flags |= PERF_EFLAGS_EXACT; - } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s)) - regs.flags |= PERF_EFLAGS_EXACT; + regs->ip = pebs->real_ip; + regs->flags |= PERF_EFLAGS_EXACT; + } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs)) + regs->flags |= PERF_EFLAGS_EXACT; else - regs.flags &= ~PERF_EFLAGS_EXACT; + regs->flags &= ~PERF_EFLAGS_EXACT; if ((sample_type & PERF_SAMPLE_ADDR) && x86_pmu.intel_cap.pebs_format >= 1) - data.addr = pebs->dla; + data->addr = pebs->dla; if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data.weight = intel_hsw_weight(pebs); + data->weight = intel_hsw_weight(pebs); if (sample_type & PERF_SAMPLE_TRANSACTION) - data.txn = intel_hsw_transaction(pebs); + data->txn = intel_hsw_transaction(pebs); } if (has_branch_stack(event)) - data.br_stack = &cpuc->lbr_stack; + data->br_stack = &cpuc->lbr_stack; +} + +static void __intel_pmu_pebs_event(struct perf_event *event, + struct pt_regs *iregs, void *__pebs) +{ + struct perf_sample_data data; + struct pt_regs regs; + + if (!intel_pmu_save_and_restart(event)) + return; + + setup_pebs_sample_data(event, iregs, __pebs, &data, ®s); if (perf_event_overflow(event, &data, ®s)) x86_pmu_stop(event, 0); -- cgit v1.2.1 From 21509084f999d7accd32e45961ef76853112e978 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 6 May 2015 15:33:49 -0400 Subject: perf/x86/intel: Handle multiple records in the PEBS buffer When the PEBS interrupt threshold is larger than one record and the machine supports multiple PEBS events, the records of these events are mixed up and we need to demultiplex them. Demuxing the records is hard because the hardware is deficient. The hardware has two issues that, when combined, create impossible scenarios to demux. The first issue is that the 'status' field of the PEBS record is a copy of the GLOBAL_STATUS MSR at PEBS assist time. To see why this is a problem let us first describe the regular PEBS cycle: A) the CTRn value reaches 0: - the corresponding bit in GLOBAL_STATUS gets set - we start arming the hardware assist < some unspecified amount of time later -- this could cover multiple events of interest > B) the hardware assist is armed, any next event will trigger it C) a matching event happens: - the hardware assist triggers and generates a PEBS record this includes a copy of GLOBAL_STATUS at this moment - if we auto-reload we (re)set CTRn - we clear the relevant bit in GLOBAL_STATUS Now consider the following chain of events: A0, B0, A1, C0 The event generated for counter 0 will include a status with counter 1 set, even though its not at all related to the record. A similar thing can happen with a !PEBS event if it just happens to overflow at the right moment. The second issue is that the hardware will only emit one record for two or more counters if the event that triggers the assist is 'close'. The 'close' can be several cycles. In some cases even the complete assist, if the event is something that doesn't need retirement. For instance, consider this chain of events: A0, B0, A1, B1, C01 Where C01 is an event that triggers both hardware assists, we will generate but a single record, but again with both counters listed in the status field. This time the record pertains to both events. Note that these two cases are different but undistinguishable with the data as generated. Therefore demuxing records with multiple PEBS bits (we can safely ignore status bits for !PEBS counters) is impossible. Furthermore we cannot emit the record to both events because that might cause a data leak -- the events might not have the same privileges -- so what this patch does is discard such events. The assumption/hope is that such discards will be rare. Here lists some possible ways you may get high discard rate. - when you count the same thing multiple times. But it is not a useful configuration. - you can be unfortunate if you measure with a userspace only PEBS event along with either a kernel or unrestricted PEBS event. Imagine the event triggering and setting the overflow flag right before entering the kernel. Then all kernel side events will end up with multiple bits set. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang [ Changelog improvements. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1430940834-8964-4-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 142 +++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index a5fe561c4902..72529c237e6e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -872,6 +872,9 @@ static void setup_pebs_sample_data(struct perf_event *event, int fll, fst, dsrc; int fl = event->hw.flags; + if (pebs == NULL) + return; + sample_type = event->attr.sample_type; dsrc = sample_type & PERF_SAMPLE_DATA_SRC; @@ -966,19 +969,68 @@ static void setup_pebs_sample_data(struct perf_event *event, data->br_stack = &cpuc->lbr_stack; } +static inline void * +get_next_pebs_record_by_bit(void *base, void *top, int bit) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + void *at; + u64 pebs_status; + + if (base == NULL) + return NULL; + + for (at = base; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; + + if (test_bit(bit, (unsigned long *)&p->status)) { + + if (p->status == (1 << bit)) + return at; + + /* clear non-PEBS bit and re-check */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + if (pebs_status == (1 << bit)) + return at; + } + } + return NULL; +} + static void __intel_pmu_pebs_event(struct perf_event *event, - struct pt_regs *iregs, void *__pebs) + struct pt_regs *iregs, + void *base, void *top, + int bit, int count) { struct perf_sample_data data; struct pt_regs regs; + int i; + void *at = get_next_pebs_record_by_bit(base, top, bit); - if (!intel_pmu_save_and_restart(event)) + if (!intel_pmu_save_and_restart(event) && + !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) return; - setup_pebs_sample_data(event, iregs, __pebs, &data, ®s); + if (count > 1) { + for (i = 0; i < count - 1; i++) { + setup_pebs_sample_data(event, iregs, at, &data, ®s); + perf_event_output(event, &data, ®s); + at += x86_pmu.pebs_record_size; + at = get_next_pebs_record_by_bit(at, top, bit); + } + } + + setup_pebs_sample_data(event, iregs, at, &data, ®s); - if (perf_event_overflow(event, &data, ®s)) + /* + * All but the last records are processed. + * The last one is left to be able to call the overflow handler. + */ + if (perf_event_overflow(event, &data, ®s)) { x86_pmu_stop(event, 0); + return; + } + } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) @@ -1008,72 +1060,78 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) if (!event->attr.precise_ip) return; - n = top - at; + n = (top - at) / x86_pmu.pebs_record_size; if (n <= 0) return; - /* - * Should not happen, we program the threshold at 1 and do not - * set a reset value. - */ - WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); - at += n - 1; - - __intel_pmu_pebs_event(event, iregs, at); + __intel_pmu_pebs_event(event, iregs, at, top, 0, n); } static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; - struct perf_event *event = NULL; - void *at, *top; - u64 status = 0; + struct perf_event *event; + void *base, *at, *top; int bit; + short counts[MAX_PEBS_EVENTS] = {}; if (!x86_pmu.pebs_active) return; - at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; + base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; ds->pebs_index = ds->pebs_buffer_base; - if (unlikely(at > top)) + if (unlikely(base >= top)) return; - /* - * Should not happen, we program the threshold at 1 and do not - * set a reset value. - */ - WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size, - "Unexpected number of pebs records %ld\n", - (long)(top - at) / x86_pmu.pebs_record_size); - - for (; at < top; at += x86_pmu.pebs_record_size) { + for (at = base; at < top; at += x86_pmu.pebs_record_size) { struct pebs_record_nhm *p = at; - for_each_set_bit(bit, (unsigned long *)&p->status, - x86_pmu.max_pebs_events) { - event = cpuc->events[bit]; - if (!test_bit(bit, cpuc->active_mask)) - continue; - - WARN_ON_ONCE(!event); - - if (!event->attr.precise_ip) - continue; + bit = find_first_bit((unsigned long *)&p->status, + x86_pmu.max_pebs_events); + if (bit >= x86_pmu.max_pebs_events) + continue; + if (!test_bit(bit, cpuc->active_mask)) + continue; + /* + * The PEBS hardware does not deal well with the situation + * when events happen near to each other and multiple bits + * are set. But it should happen rarely. + * + * If these events include one PEBS and multiple non-PEBS + * events, it doesn't impact PEBS record. The record will + * be handled normally. (slow path) + * + * If these events include two or more PEBS events, the + * records for the events can be collapsed into a single + * one, and it's not possible to reconstruct all events + * that caused the PEBS record. It's called collision. + * If collision happened, the record will be dropped. + * + */ + if (p->status != (1 << bit)) { + u64 pebs_status; - if (__test_and_set_bit(bit, (unsigned long *)&status)) + /* slow path */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + if (pebs_status != (1 << bit)) continue; - - break; } + counts[bit]++; + } - if (!event || bit >= x86_pmu.max_pebs_events) + for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { + if (counts[bit] == 0) continue; + event = cpuc->events[bit]; + WARN_ON_ONCE(!event); + WARN_ON_ONCE(!event->attr.precise_ip); - __intel_pmu_pebs_event(event, iregs, at); + __intel_pmu_pebs_event(event, iregs, base, top, bit, counts[bit]); } } -- cgit v1.2.1 From 3569c0d7c5440d6fd06b10e1ef9614588a049bc7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 6 May 2015 15:33:50 -0400 Subject: perf/x86/intel: Implement batched PEBS interrupt handling (large PEBS interrupt threshold) PEBS always had the capability to log samples to its buffers without an interrupt. Traditionally perf has not used this but always set the PEBS threshold to one. For frequently occurring events (like cycles or branches or load/store) this in term requires using a relatively high sampling period to avoid overloading the system, by only processing PMIs. This in term increases sampling error. For the common cases we still need to use the PMI because the PEBS hardware has various limitations. The biggest one is that it can not supply a callgraph. It also requires setting a fixed period, as the hardware does not support adaptive period. Another issue is that it cannot supply a time stamp and some other options. To supply a TID it requires flushing on context switch. It can however supply the IP, the load/store address, TSX information, registers, and some other things. So we can make PEBS work for some specific cases, basically as long as you can do without a callgraph and can set the period you can use this new PEBS mode. The main benefit is the ability to support much lower sampling period (down to -c 1000) without extensive overhead. One use cases is for example to increase the resolution of the c2c tool. Another is double checking when you suspect the standard sampling has too much sampling error. Some numbers on the overhead, using cycle soak, comparing the elapsed time from "kernbench -M -H" between plain (threshold set to one) and multi (large threshold). The test command for plain: "perf record --time -e cycles:p -c $period -- kernbench -M -H" The test command for multi: "perf record --no-time -e cycles:p -c $period -- kernbench -M -H" ( The only difference of test command between multi and plain is time stamp options. Since time stamp is not supported by large PEBS threshold, it can be used as a flag to indicate if large threshold is enabled during the test. ) period plain(Sec) multi(Sec) Delta 10003 32.7 16.5 16.2 20003 30.2 16.2 14.0 40003 18.6 14.1 4.5 80003 16.8 14.6 2.2 100003 16.9 14.1 2.8 800003 15.4 15.7 -0.3 1000003 15.3 15.2 0.2 2000003 15.3 15.1 0.1 With periods below 100003, plain (threshold one) cause much more overhead. With 10003 sampling period, the Elapsed Time for multi is even 2X faster than plain. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1430940834-8964-5-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 11 +++++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 5 ++++- arch/x86/kernel/cpu/perf_event_intel_ds.c | 27 +++++++++++++++++++++++---- 3 files changed, 38 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 7a3f0fdd2fbd..a73dfc97226b 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -76,6 +76,7 @@ struct event_constraint { #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ #define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ #define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */ struct amd_nb { @@ -88,6 +89,16 @@ struct amd_nb { /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS 8 +/* + * Flags PEBS can handle without an PMI. + * + */ +#define PEBS_FREERUNNING_FLAGS \ + (PERF_SAMPLE_IP | PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ + PERF_SAMPLE_TRANSACTION) + /* * A debug store configuration. * diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 17628930a80e..6985f43c5eb9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2261,8 +2261,11 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { - if (!event->attr.freq) + if (!event->attr.freq) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; + if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS)) + event->hw.flags |= PERF_X86_EVENT_FREERUNNING; + } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 72529c237e6e..0ce455d958b8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -250,7 +250,7 @@ static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; int node = cpu_to_node(cpu); - int max, thresh = 1; /* always use a single PEBS record */ + int max; void *buffer, *ibuffer; if (!x86_pmu.pebs) @@ -280,9 +280,6 @@ static int alloc_pebs_buffer(int cpu) ds->pebs_absolute_maximum = ds->pebs_buffer_base + max * x86_pmu.pebs_record_size; - ds->pebs_interrupt_threshold = ds->pebs_buffer_base + - thresh * x86_pmu.pebs_record_size; - return 0; } @@ -684,14 +681,22 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) return &emptyconstraint; } +static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc) +{ + return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1)); +} + void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct debug_store *ds = cpuc->ds; + bool first_pebs; + u64 threshold; hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + first_pebs = !pebs_is_enabled(cpuc); cpuc->pebs_enabled |= 1ULL << hwc->idx; if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) @@ -699,11 +704,25 @@ void intel_pmu_pebs_enable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled |= 1ULL << 63; + /* + * When the event is constrained enough we can use a larger + * threshold and run the event with less frequent PMI. + */ + if (hwc->flags & PERF_X86_EVENT_FREERUNNING) { + threshold = ds->pebs_absolute_maximum - + x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; + } else { + threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; + } + /* Use auto-reload if possible to save a MSR write in the PMI */ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { ds->pebs_event_reset[hwc->idx] = (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; } + + if (first_pebs || ds->pebs_interrupt_threshold > threshold) + ds->pebs_interrupt_threshold = threshold; } void intel_pmu_pebs_disable(struct perf_event *event) -- cgit v1.2.1 From 9c964efa4330a58520783effe9847f15126fef1f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 6 May 2015 15:33:51 -0400 Subject: perf/x86/intel: Drain the PEBS buffer during context switches Flush the PEBS buffer during context switches if PEBS interrupt threshold is larger than one. This allows perf to supply TID for sample outputs. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1430940834-8964-6-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 6 +++++- arch/x86/kernel/cpu/perf_event_intel.c | 11 +++++++++- arch/x86/kernel/cpu/perf_event_intel_ds.c | 32 ++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 --- 4 files changed, 47 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index a73dfc97226b..74089bcb6d74 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -92,9 +92,11 @@ struct amd_nb { /* * Flags PEBS can handle without an PMI. * + * TID can only be handled by flushing at context switch. + * */ #define PEBS_FREERUNNING_FLAGS \ - (PERF_SAMPLE_IP | PERF_SAMPLE_ADDR | \ + (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ PERF_SAMPLE_TRANSACTION) @@ -877,6 +879,8 @@ void intel_pmu_pebs_enable_all(void); void intel_pmu_pebs_disable_all(void); +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); + void intel_ds_init(void); void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6985f43c5eb9..d455e2a61287 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2642,6 +2642,15 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } +static void intel_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (x86_pmu.pebs_active) + intel_pmu_pebs_sched_task(ctx, sched_in); + if (x86_pmu.lbr_nr) + intel_pmu_lbr_sched_task(ctx, sched_in); +} + PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -2731,7 +2740,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, - .sched_task = intel_pmu_lbr_sched_task, + .sched_task = intel_pmu_sched_task, }; static __init void intel_clovertown_quirk(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 0ce455d958b8..62852470ccbb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -546,6 +546,19 @@ int intel_pmu_drain_bts_buffer(void) return 1; } +static inline void intel_pmu_drain_pebs_buffer(void) +{ + struct pt_regs regs; + + x86_pmu.drain_pebs(®s); +} + +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + if (!sched_in) + intel_pmu_drain_pebs_buffer(); +} + /* * PEBS */ @@ -711,8 +724,19 @@ void intel_pmu_pebs_enable(struct perf_event *event) if (hwc->flags & PERF_X86_EVENT_FREERUNNING) { threshold = ds->pebs_absolute_maximum - x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; + + if (first_pebs) + perf_sched_cb_inc(event->ctx->pmu); } else { threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; + + /* + * If not all events can use larger buffer, + * roll back to threshold = 1 + */ + if (!first_pebs && + (ds->pebs_interrupt_threshold > threshold)) + perf_sched_cb_dec(event->ctx->pmu); } /* Use auto-reload if possible to save a MSR write in the PMI */ @@ -729,6 +753,7 @@ void intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -737,6 +762,13 @@ void intel_pmu_pebs_disable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); + if (ds->pebs_interrupt_threshold > + ds->pebs_buffer_base + x86_pmu.pebs_record_size) { + intel_pmu_drain_pebs_buffer(); + if (!pebs_is_enabled(cpuc)) + perf_sched_cb_dec(event->ctx->pmu); + } + if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 201e16f6655a..452a7bd2dedb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -264,9 +264,6 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx; - if (!x86_pmu.lbr_nr) - return; - /* * If LBR callstack feature is enabled and the stack was saved when * the task was scheduled out, restore the stack. Otherwise flush -- cgit v1.2.1 From 156174999dd1d0fe8732f5a05f4e9cef921ad487 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 6 May 2015 15:33:52 -0400 Subject: perf/intel/x86: Enlarge the PEBS buffer Currently the PEBS buffer size is 4k, it can only hold about 21 PEBS records. This patch enlarges the PEBS buffer size to 64k (the same as the BTS buffer). 64k memory can hold about 330 PEBS records. This will significantly reduce the number of PMIs when batched PEBS interrupts are enabled. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1430940834-8964-7-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 62852470ccbb..266079a3a646 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -11,7 +11,7 @@ #define BTS_RECORD_SIZE 24 #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE PAGE_SIZE +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) #define PEBS_FIXUP_SIZE PAGE_SIZE /* -- cgit v1.2.1 From f38b0dbb491a6987e198aa6b428db8692a6480f8 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Sun, 10 May 2015 15:13:14 -0400 Subject: perf/x86/intel: Introduce PERF_RECORD_LOST_SAMPLES After enlarging the PEBS interrupt threshold, there may be some mixed up PEBS samples which are discarded by the kernel. This patch makes the kernel emit a PERF_RECORD_LOST_SAMPLES record with the number of possible discarded records when it is impossible to demux the samples. It makes sure the user is not left in the dark about such discards. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285195-14269-8-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 266079a3a646..34d0c4816141 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1126,6 +1126,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) void *base, *at, *top; int bit; short counts[MAX_PEBS_EVENTS] = {}; + short error[MAX_PEBS_EVENTS] = {}; if (!x86_pmu.pebs_active) return; @@ -1169,20 +1170,33 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) /* slow path */ pebs_status = p->status & cpuc->pebs_enabled; pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; - if (pebs_status != (1 << bit)) + if (pebs_status != (1 << bit)) { + u8 i; + + for_each_set_bit(i, (unsigned long *)&pebs_status, + MAX_PEBS_EVENTS) + error[i]++; continue; + } } counts[bit]++; } for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { - if (counts[bit] == 0) + if ((counts[bit] == 0) && (error[bit] == 0)) continue; event = cpuc->events[bit]; WARN_ON_ONCE(!event); WARN_ON_ONCE(!event->attr.precise_ip); - __intel_pmu_pebs_event(event, iregs, base, top, bit, counts[bit]); + /* log dropped samples number */ + if (error[bit]) + perf_log_lost_samples(event, error[bit]); + + if (counts[bit]) { + __intel_pmu_pebs_event(event, iregs, base, + top, bit, counts[bit]); + } } } -- cgit v1.2.1 From a3d86542de8850be52e8589da22b24002941dfb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 May 2015 15:18:18 +0200 Subject: perf/x86/intel/pebs: Add PEBSv3 decoding PEBSv3 as present on Skylake fixed the long standing issue of the status bits. They now really reflect the events that generated the record. Tested-by: Andi Kleen Tested-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 34d0c4816141..71fc40238843 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1034,6 +1034,9 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) struct pebs_record_nhm *p = at; if (test_bit(bit, (unsigned long *)&p->status)) { + /* PEBS v3 has accurate status bits */ + if (x86_pmu.intel_cap.pebs_format >= 3) + return at; if (p->status == (1 << bit)) return at; @@ -1055,20 +1058,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event, { struct perf_sample_data data; struct pt_regs regs; - int i; void *at = get_next_pebs_record_by_bit(base, top, bit); if (!intel_pmu_save_and_restart(event) && !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) return; - if (count > 1) { - for (i = 0; i < count - 1; i++) { - setup_pebs_sample_data(event, iregs, at, &data, ®s); - perf_event_output(event, &data, ®s); - at += x86_pmu.pebs_record_size; - at = get_next_pebs_record_by_bit(at, top, bit); - } + while (count > 1) { + setup_pebs_sample_data(event, iregs, at, &data, ®s); + perf_event_output(event, &data, ®s); + at += x86_pmu.pebs_record_size; + at = get_next_pebs_record_by_bit(at, top, bit); + count--; } setup_pebs_sample_data(event, iregs, at, &data, ®s); @@ -1124,9 +1125,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) struct debug_store *ds = cpuc->ds; struct perf_event *event; void *base, *at, *top; - int bit; short counts[MAX_PEBS_EVENTS] = {}; short error[MAX_PEBS_EVENTS] = {}; + int bit, i; if (!x86_pmu.pebs_active) return; @@ -1142,6 +1143,15 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) for (at = base; at < top; at += x86_pmu.pebs_record_size) { struct pebs_record_nhm *p = at; + /* PEBS v3 has accurate status bits */ + if (x86_pmu.intel_cap.pebs_format >= 3) { + for_each_set_bit(bit, (unsigned long *)&p->status, + MAX_PEBS_EVENTS) + counts[bit]++; + + continue; + } + bit = find_first_bit((unsigned long *)&p->status, x86_pmu.max_pebs_events); if (bit >= x86_pmu.max_pebs_events) @@ -1171,8 +1181,6 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) pebs_status = p->status & cpuc->pebs_enabled; pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; if (pebs_status != (1 << bit)) { - u8 i; - for_each_set_bit(i, (unsigned long *)&pebs_status, MAX_PEBS_EVENTS) error[i]++; -- cgit v1.2.1 From 2cd23553b488589f287457b7396470f5e3c40698 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:28:07 +0200 Subject: x86/asm/entry: Rename compat syscall entry points Rename the following system call entry points: ia32_cstar_target -> entry_SYSCALL_compat ia32_syscall -> entry_INT80_compat The generic naming scheme for x86 system call entry points is: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 8 ++++---- arch/x86/entry/syscall_32.c | 6 +++--- arch/x86/include/asm/proto.h | 4 ++-- arch/x86/kernel/asm-offsets_64.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/traps.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 9558dacf32b9..8058892fb5ff 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -288,7 +288,7 @@ ENDPROC(ia32_sysenter_target) * path below. We set up a complete hardware stack frame to share code * with the int 0x80 path. */ -ENTRY(ia32_cstar_target) +ENTRY(entry_SYSCALL_compat) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -409,7 +409,7 @@ cstar_tracesys: RESTORE_EXTRA_REGS jmp cstar_do_call -END(ia32_cstar_target) +END(entry_SYSCALL_compat) ia32_badarg: ASM_CLAC @@ -445,7 +445,7 @@ ia32_ret_from_sys_call: * Assumes it is only called from user space and entered with interrupts off. */ -ENTRY(ia32_syscall) +ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -511,7 +511,7 @@ ia32_tracesys: movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS jmp ia32_do_call -END(ia32_syscall) +END(entry_INT80_compat) .macro PTREGSCALL label, func ALIGN diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 3777189c4a19..e398d033673f 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -10,7 +10,7 @@ #else #define SYM(sym, compat) sym #define ia32_sys_call_table sys_call_table -#define __NR_ia32_syscall_max __NR_syscall_max +#define __NR_entry_INT80_compat_max __NR_syscall_max #endif #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; @@ -23,11 +23,11 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_entry_INT80_compat_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, + [0 ... __NR_entry_INT80_compat_max] = &sys_ni_syscall, #include }; diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index a90f8972dad5..7d2961a231f1 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -8,8 +8,8 @@ void system_call(void); void syscall_init(void); -void ia32_syscall(void); -void ia32_cstar_target(void); +void entry_INT80_compat(void); +void entry_SYSCALL_compat(void); void ia32_sysenter_target(void); void x86_configure_nx(void); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index dcaab87da629..599afcf0005f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -66,7 +66,7 @@ int main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); - DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(__NR_entry_INT80_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6bec0b55863e..f0b85c401014 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1207,7 +1207,7 @@ void syscall_init(void) wrmsrl(MSR_LSTAR, system_call); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, ia32_cstar_target); + wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5e0791f9d3dc..edf97986a53d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -992,7 +992,7 @@ void __init trap_init(void) set_bit(i, used_vectors); #ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 04529e620559..3c43c03a499c 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -122,7 +122,7 @@ ENDPROC(xen_syscall_target) /* 32-bit compat syscall target */ ENTRY(xen_syscall32_target) undo_xen_syscall - jmp ia32_cstar_target + jmp entry_SYSCALL_compat ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ -- cgit v1.2.1 From 4c8cd0c50d0b1559727bf0ec7ff27caeba2dfe09 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:33:56 +0200 Subject: x86/asm/entry: Untangle 'ia32_sysenter_target' into two entry points: entry_SYSENTER_32 and entry_SYSENTER_compat So the SYSENTER instruction is pretty quirky and it has different behavior depending on bitness and CPU maker. Yet we create a false sense of coherency by naming it 'ia32_sysenter_target' in both of the cases. Split the name into its two uses: ia32_sysenter_target (32) -> entry_SYSENTER_32 ia32_sysenter_target (64) -> entry_SYSENTER_compat As per the generic naming scheme for x86 system call entry points: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 10 +++++----- arch/x86/entry/entry_64_compat.S | 4 ++-- arch/x86/include/asm/proto.h | 3 ++- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/xen/xen-asm_64.S | 2 +- 5 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 0ac73de925d1..a65f46c3b8e1 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -307,7 +307,7 @@ END(resume_kernel) the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ # sysenter call handler stub -ENTRY(ia32_sysenter_target) +ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp),%esp sysenter_past_esp: /* @@ -412,7 +412,7 @@ sysexit_audit: .popsection _ASM_EXTABLE(1b,2b) PTGS_TO_GS_EX -ENDPROC(ia32_sysenter_target) +ENDPROC(entry_SYSENTER_32) # system call handler stub ENTRY(system_call) @@ -1135,7 +1135,7 @@ END(page_fault) ENTRY(debug) ASM_CLAC - cmpl $ia32_sysenter_target,(%esp) + cmpl $entry_SYSENTER_32,(%esp) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: @@ -1165,7 +1165,7 @@ ENTRY(nmi) popl %eax je nmi_espfix_stack #endif - cmpl $ia32_sysenter_target,(%esp) + cmpl $entry_SYSENTER_32,(%esp) je nmi_stack_fixup pushl %eax movl %esp,%eax @@ -1176,7 +1176,7 @@ ENTRY(nmi) cmpl $(THREAD_SIZE-20),%eax popl %eax jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) + cmpl $entry_SYSENTER_32,12(%esp) je nmi_debug_stack_check nmi_stack_correct: pushl %eax diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 8058892fb5ff..59840e33d203 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -57,7 +57,7 @@ ENDPROC(native_usergs_sysret32) * path below. We set up a complete hardware stack frame to share code * with the int 0x80 path. */ -ENTRY(ia32_sysenter_target) +ENTRY(entry_SYSENTER_compat) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -256,7 +256,7 @@ sysenter_tracesys: RESTORE_EXTRA_REGS jmp sysenter_do_call -ENDPROC(ia32_sysenter_target) +ENDPROC(entry_SYSENTER_compat) /* * 32-bit SYSCALL instruction entry. diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 7d2961a231f1..83a7f8227949 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -10,7 +10,8 @@ void syscall_init(void); void entry_INT80_compat(void); void entry_SYSCALL_compat(void); -void ia32_sysenter_target(void); +void entry_SYSENTER_32(void); +void entry_SYSENTER_compat(void); void x86_configure_nx(void); void x86_report_nx(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f0b85c401014..b2ae7cec33ca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1026,7 +1026,7 @@ void enable_sep_cpu(void) (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); out: put_cpu(); @@ -1216,7 +1216,7 @@ void syscall_init(void) */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 3c43c03a499c..ccac1b1e6e93 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -128,7 +128,7 @@ ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ ENTRY(xen_sysenter_target) undo_xen_syscall - jmp ia32_sysenter_target + jmp entry_SYSENTER_compat ENDPROC(xen_sysenter_target) #else /* !CONFIG_IA32_EMULATION */ -- cgit v1.2.1 From b2502b418e63fcde0fe1857732a476b5aa3789b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:42:03 +0200 Subject: x86/asm/entry: Untangle 'system_call' into two entry points: entry_SYSCALL_64 and entry_INT80_32 The 'system_call' entry points differ starkly between native 32-bit and 64-bit kernels: on 32-bit kernels it defines the INT 0x80 entry point, while on 64-bit it's the SYSCALL entry point. This is pretty confusing when looking at generic code, and it also obscures the nature of the entry point at the assembly level. So unangle this by splitting the name into its two uses: system_call (32) -> entry_INT80_32 system_call (64) -> entry_SYSCALL_64 As per the generic naming scheme for x86 system call entry points: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 10 +++++----- arch/x86/include/asm/proto.h | 5 +++-- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/traps.c | 5 ++--- arch/x86/xen/xen-asm_64.S | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a65f46c3b8e1..d59461032625 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -415,7 +415,7 @@ sysexit_audit: ENDPROC(entry_SYSENTER_32) # system call handler stub -ENTRY(system_call) +ENTRY(entry_INT80_32) ASM_CLAC pushl %eax # save orig_eax SAVE_ALL @@ -508,7 +508,7 @@ ldt_ss: lss (%esp), %esp /* switch to espfix segment */ jmp restore_nocheck #endif -ENDPROC(system_call) +ENDPROC(entry_INT80_32) # perform work that needs to be done immediately before resumption ALIGN diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4cf3dd36aa0d..e1852c407155 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -137,7 +137,7 @@ ENDPROC(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ -ENTRY(system_call) +ENTRY(entry_SYSCALL_64) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -149,7 +149,7 @@ ENTRY(system_call) * after the swapgs, so that it can do the swapgs * for the guest and jump here on syscall. */ -GLOBAL(system_call_after_swapgs) +GLOBAL(entry_SYSCALL_64_after_swapgs) movq %rsp,PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp @@ -182,7 +182,7 @@ GLOBAL(system_call_after_swapgs) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz tracesys -system_call_fastpath: +entry_SYSCALL_64_fastpath: #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax #else @@ -246,7 +246,7 @@ tracesys: jnz tracesys_phase2 /* if needed, run the slow path */ RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ movq ORIG_RAX(%rsp), %rax - jmp system_call_fastpath /* and return to the fast path */ + jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ tracesys_phase2: SAVE_EXTRA_REGS @@ -411,7 +411,7 @@ syscall_return_via_sysret: opportunistic_sysret_failed: SWAPGS jmp restore_c_regs_and_iret -END(system_call) +END(entry_SYSCALL_64) .macro FORK_LIKE func diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 83a7f8227949..a4a77286cb1d 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -5,11 +5,12 @@ /* misc architecture specific prototypes */ -void system_call(void); void syscall_init(void); -void entry_INT80_compat(void); +void entry_SYSCALL_64(void); void entry_SYSCALL_compat(void); +void entry_INT80_32(void); +void entry_INT80_compat(void); void entry_SYSENTER_32(void); void entry_SYSENTER_compat(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b2ae7cec33ca..914be4bbc2e5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1204,7 +1204,7 @@ void syscall_init(void) * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_LSTAR, entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index edf97986a53d..001ddac221a1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -72,8 +72,7 @@ gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include #include - -asmlinkage int system_call(void); +#include #endif /* Must be page-aligned because the real IDT is used in a fixmap. */ @@ -997,7 +996,7 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(IA32_SYSCALL_VECTOR, &system_call); + set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index ccac1b1e6e93..f22667abf7b9 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -114,7 +114,7 @@ RELOC(xen_sysret32, 1b+1) /* Normal 64-bit system call target */ ENTRY(xen_syscall_target) undo_xen_syscall - jmp system_call_after_swapgs + jmp entry_SYSCALL_64_after_swapgs ENDPROC(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION -- cgit v1.2.1 From a49976d14f780942dafafbbf16f891c27d385ea0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 09:49:11 +0200 Subject: x86/asm/entry/32: Clean up entry_32.S Make the 32-bit syscall entry code a bit more readable: - use consistent assembly coding style similar to entry_64.S - remove old comments that are not true anymore - eliminate whitespace noise - use consistent vertical spacing - fix various comments No code changed: # arch/x86/entry/entry_32.o: text data bss dec hex filename 6025 0 0 6025 1789 entry_32.o.before 6025 0 0 6025 1789 entry_32.o.after md5: f3fa16b2b0dca804f052deb6b30ba6cb entry_32.o.before.asm f3fa16b2b0dca804f052deb6b30ba6cb entry_32.o.after.asm Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 1141 ++++++++++++++++++++++----------------------- 1 file changed, 570 insertions(+), 571 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index d59461032625..edd7aadfacfa 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1,23 +1,12 @@ /* + * Copyright (C) 1991,1992 Linus Torvalds * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * This also contains the timer-interrupt handler, as well as all interrupts - * and faults that can result in a task-switch. - * - * NOTE: This code handles signal-recognition, which happens every time - * after a timer-interrupt and after each system call. - * - * I changed all the .align's to 4 (16 byte alignment), as that's faster - * on a 486. + * entry_32.S contains the system-call and low-level fault and trap handling routines. * * Stack layout in 'syscall_exit': - * ptrace needs to have all regs on the stack. - * if the order here is changed, it needs to be - * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace needs to have all registers on the stack. + * If the order here is changed, it needs to be + * updated in fork.c:copy_process(), signal.c:do_signal(), * ptrace.c and ptrace.h * * 0(%esp) - %ebx @@ -37,8 +26,6 @@ * 38(%esp) - %eflags * 3C(%esp) - %oldesp * 40(%esp) - %oldss - * - * "current" is in register %ebx during any slow entries. */ #include @@ -61,11 +48,11 @@ /* Avoid __ASSEMBLER__'ifying just for this. */ #include #define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 +#define __AUDIT_ARCH_LE 0x40000000 #ifndef CONFIG_AUDITSYSCALL -#define sysenter_audit syscall_trace_entry -#define sysexit_audit syscall_exit_work +# define sysenter_audit syscall_trace_entry +# define sysexit_audit syscall_exit_work #endif .section .entry.text, "ax" @@ -84,16 +71,16 @@ */ #ifdef CONFIG_PREEMPT -#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else -#define preempt_stop(clobbers) -#define resume_kernel restore_all +# define preempt_stop(clobbers) +# define resume_kernel restore_all #endif .macro TRACE_IRQS_IRET #ifdef CONFIG_TRACE_IRQFLAGS - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? - jz 1f + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off? + jz 1f TRACE_IRQS_ON 1: #endif @@ -112,10 +99,10 @@ /* unfortunately push/pop can't be no-op */ .macro PUSH_GS - pushl $0 + pushl $0 .endm .macro POP_GS pop=0 - addl $(4 + \pop), %esp + addl $(4 + \pop), %esp .endm .macro POP_GS_EX .endm @@ -135,119 +122,119 @@ #else /* CONFIG_X86_32_LAZY_GS */ .macro PUSH_GS - pushl %gs + pushl %gs .endm .macro POP_GS pop=0 -98: popl %gs +98: popl %gs .if \pop <> 0 add $\pop, %esp .endif .endm .macro POP_GS_EX .pushsection .fixup, "ax" -99: movl $0, (%esp) - jmp 98b +99: movl $0, (%esp) + jmp 98b .popsection - _ASM_EXTABLE(98b,99b) + _ASM_EXTABLE(98b, 99b) .endm .macro PTGS_TO_GS -98: mov PT_GS(%esp), %gs +98: mov PT_GS(%esp), %gs .endm .macro PTGS_TO_GS_EX .pushsection .fixup, "ax" -99: movl $0, PT_GS(%esp) - jmp 98b +99: movl $0, PT_GS(%esp) + jmp 98b .popsection - _ASM_EXTABLE(98b,99b) + _ASM_EXTABLE(98b, 99b) .endm .macro GS_TO_REG reg - movl %gs, \reg + movl %gs, \reg .endm .macro REG_TO_PTGS reg - movl \reg, PT_GS(%esp) + movl \reg, PT_GS(%esp) .endm .macro SET_KERNEL_GS reg - movl $(__KERNEL_STACK_CANARY), \reg - movl \reg, %gs + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs .endm -#endif /* CONFIG_X86_32_LAZY_GS */ +#endif /* CONFIG_X86_32_LAZY_GS */ .macro SAVE_ALL cld PUSH_GS - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - movl $(__USER_DS), %edx - movl %edx, %ds - movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs SET_KERNEL_GS %edx .endm .macro RESTORE_INT_REGS - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax .endm .macro RESTORE_REGS pop=0 RESTORE_INT_REGS -1: popl %ds -2: popl %es -3: popl %fs +1: popl %ds +2: popl %es +3: popl %fs POP_GS \pop .pushsection .fixup, "ax" -4: movl $0, (%esp) - jmp 1b -5: movl $0, (%esp) - jmp 2b -6: movl $0, (%esp) - jmp 3b +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b .popsection - _ASM_EXTABLE(1b,4b) - _ASM_EXTABLE(2b,5b) - _ASM_EXTABLE(3b,6b) + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 5b) + _ASM_EXTABLE(3b, 6b) POP_GS_EX .endm ENTRY(ret_from_fork) - pushl %eax - call schedule_tail + pushl %eax + call schedule_tail GET_THREAD_INFO(%ebp) - popl %eax - pushl $0x0202 # Reset kernel eflags + popl %eax + pushl $0x0202 # Reset kernel eflags popfl - jmp syscall_exit + jmp syscall_exit END(ret_from_fork) ENTRY(ret_from_kernel_thread) - pushl %eax - call schedule_tail + pushl %eax + call schedule_tail GET_THREAD_INFO(%ebp) - popl %eax - pushl $0x0202 # Reset kernel eflags + popl %eax + pushl $0x0202 # Reset kernel eflags popfl - movl PT_EBP(%esp),%eax - call *PT_EBX(%esp) - movl $0,PT_EAX(%esp) - jmp syscall_exit + movl PT_EBP(%esp), %eax + call *PT_EBX(%esp) + movl $0, PT_EAX(%esp) + jmp syscall_exit ENDPROC(ret_from_kernel_thread) /* @@ -264,62 +251,65 @@ ret_from_exception: ret_from_intr: GET_THREAD_INFO(%ebp) #ifdef CONFIG_VM86 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax #else /* * We can be coming here from child spawned by kernel_thread(). */ - movl PT_CS(%esp), %eax - andl $SEGMENT_RPL_MASK, %eax + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax #endif - cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on - # int/exception return? - jne work_pending - jmp restore_all + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) need_resched: - cmpl $0,PER_CPU_VAR(__preempt_count) - jnz restore_all - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all - call preempt_schedule_irq - jmp need_resched + cmpl $0, PER_CPU_VAR(__preempt_count) + jnz restore_all + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched END(resume_kernel) #endif -/* SYSENTER_RETURN points to after the "sysenter" instruction in - the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ +/* + * SYSENTER_RETURN points to after the SYSENTER instruction + * in the vsyscall page. See vsyscall-sysentry.S, which defines + * the symbol. + */ - # sysenter call handler stub + # SYSENTER call handler stub ENTRY(entry_SYSENTER_32) - movl TSS_sysenter_sp0(%esp),%esp + movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: /* * Interrupts are disabled here, but we can't trace it until * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. */ - pushl $__USER_DS - pushl %ebp + pushl $__USER_DS + pushl %ebp pushfl - orl $X86_EFLAGS_IF, (%esp) - pushl $__USER_CS + orl $X86_EFLAGS_IF, (%esp) + pushl $__USER_CS /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary: TI_sysenter_return @@ -328,9 +318,9 @@ sysenter_past_esp: * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; * and THREAD_SIZE takes us to the bottom. */ - pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) + pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - pushl %eax + pushl %eax SAVE_ALL ENABLE_INTERRUPTS(CLBR_NONE) @@ -338,132 +328,134 @@ sysenter_past_esp: * Load the potential sixth argument from user stack. * Careful about security. */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault + cmpl $__PAGE_OFFSET-3, %ebp + jae syscall_fault ASM_STAC -1: movl (%ebp),%ebp +1: movl (%ebp), %ebp ASM_CLAC - movl %ebp,PT_EBP(%esp) - _ASM_EXTABLE(1b,syscall_fault) + movl %ebp, PT_EBP(%esp) + _ASM_EXTABLE(1b, syscall_fault) GET_THREAD_INFO(%ebp) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz sysenter_audit + testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) + jnz sysenter_audit sysenter_do_call: - cmpl $(NR_syscalls), %eax - jae sysenter_badsys - call *sys_call_table(,%eax,4) + cmpl $(NR_syscalls), %eax + jae sysenter_badsys + call *sys_call_table(, %eax, 4) sysenter_after_call: - movl %eax,PT_EAX(%esp) + movl %eax, PT_EAX(%esp) LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx - jnz sysexit_audit + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jnz sysexit_audit sysenter_exit: /* if something modifies registers it must also disable sysexit */ - movl PT_EIP(%esp), %edx - movl PT_OLDESP(%esp), %ecx - xorl %ebp,%ebp + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp, %ebp TRACE_IRQS_ON -1: mov PT_FS(%esp), %fs +1: mov PT_FS(%esp), %fs PTGS_TO_GS ENABLE_INTERRUPTS_SYSEXIT #ifdef CONFIG_AUDITSYSCALL sysenter_audit: - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry - /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ - movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ - /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl PT_ESI(%esp) /* a3: 5th arg */ - pushl PT_EDX+4(%esp) /* a2: 4th arg */ - call __audit_syscall_entry - popl %ecx /* get that remapped edx off the stack */ - popl %ecx /* get that remapped esi off the stack */ - movl PT_EAX(%esp),%eax /* reload syscall number */ - jmp sysenter_do_call + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp) + jnz syscall_trace_entry + /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ + movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ + /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ + pushl PT_ESI(%esp) /* a3: 5th arg */ + pushl PT_EDX+4(%esp) /* a2: 4th arg */ + call __audit_syscall_entry + popl %ecx /* get that remapped edx off the stack */ + popl %ecx /* get that remapped esi off the stack */ + movl PT_EAX(%esp), %eax /* reload syscall number */ + jmp sysenter_do_call sysexit_audit: - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) - movl %eax,%edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - setbe %al /* 1 if so, 0 if not */ - movzbl %al,%eax /* zero-extend that */ - call __audit_syscall_exit + movl %eax, %edx /* second arg, syscall return value */ + cmpl $-MAX_ERRNO, %eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ + movzbl %al, %eax /* zero-extend that */ + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - movl PT_EAX(%esp),%eax /* reload syscall return value */ - jmp sysenter_exit + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + movl PT_EAX(%esp), %eax /* reload syscall return value */ + jmp sysenter_exit #endif -.pushsection .fixup,"ax" -2: movl $0,PT_FS(%esp) - jmp 1b +.pushsection .fixup, "ax" +2: movl $0, PT_FS(%esp) + jmp 1b .popsection - _ASM_EXTABLE(1b,2b) + _ASM_EXTABLE(1b, 2b) PTGS_TO_GS_EX ENDPROC(entry_SYSENTER_32) # system call handler stub ENTRY(entry_INT80_32) ASM_CLAC - pushl %eax # save orig_eax + pushl %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(NR_syscalls), %eax - jae syscall_badsys + # system call tracing in operation / emulation + testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(NR_syscalls), %eax + jae syscall_badsys syscall_call: - call *sys_call_table(,%eax,4) + call *sys_call_table(, %eax, 4) syscall_after_call: - movl %eax,PT_EAX(%esp) # store the return value + movl %eax, PT_EAX(%esp) # store the return value syscall_exit: LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx # current->work - jnz syscall_exit_work + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jnz syscall_exit_work restore_all: TRACE_IRQS_IRET restore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: PT_OLDSS(%esp) contains the wrong/random values if we - # are returning to the kernel. - # See comments in process.c:copy_thread() for details. - movb PT_OLDSS(%esp), %ah - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax - cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je ldt_ss # returning to user-space with LDT SS + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + /* + * Warning: PT_OLDSS(%esp) contains the wrong/random values if we + * are returning to the kernel. + * See comments in process.c:copy_thread() for details. + */ + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + je ldt_ss # returning to user-space with LDT SS #endif restore_nocheck: - RESTORE_REGS 4 # skip orig_eax/error_code + RESTORE_REGS 4 # skip orig_eax/error_code irq_return: INTERRUPT_RETURN -.section .fixup,"ax" -ENTRY(iret_exc) - pushl $0 # no error code - pushl $do_iret_error - jmp error_code +.section .fixup, "ax" +ENTRY(iret_exc ) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code .previous - _ASM_EXTABLE(irq_return,iret_exc) + _ASM_EXTABLE(irq_return, iret_exc) #ifdef CONFIG_X86_ESPFIX32 ldt_ss: @@ -476,8 +468,8 @@ ldt_ss: * is still available to implement the setting of the high * 16-bits in the INTERRUPT_RETURN paravirt-op. */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck #endif /* @@ -492,21 +484,23 @@ ldt_ss: * a base address that matches for the difference. */ #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) - mov %esp, %edx /* load kernel esp */ - mov PT_OLDESP(%esp), %eax /* load userspace esp */ - mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ shr $16, %edx - mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ - mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - pushl %eax /* new kernel esp */ - /* Disable interrupts, but do not irqtrace this section: we + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* + * Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to - * the irqstate after the iret */ + * the irqstate after the IRET: + */ DISABLE_INTERRUPTS(CLBR_EAX) - lss (%esp), %esp /* switch to espfix segment */ - jmp restore_nocheck + lss (%esp), %esp /* switch to espfix segment */ + jmp restore_nocheck #endif ENDPROC(entry_INT80_32) @@ -514,93 +508,93 @@ ENDPROC(entry_INT80_32) ALIGN work_pending: testb $_TIF_NEED_RESCHED, %cl - jz work_notifysig + jz work_notifysig work_resched: - call schedule + call schedule LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other - # than syscall tracing? - jz restore_all + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all testb $_TIF_NEED_RESCHED, %cl - jnz work_resched + jnz work_resched -work_notifysig: # deal with pending signals and - # notify-resume requests +work_notifysig: # deal with pending signals and + # notify-resume requests #ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) - movl %esp, %eax - jnz work_notifysig_v86 # returning to kernel-space or - # vm86-space + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) + movl %esp, %eax + jnz work_notifysig_v86 # returning to kernel-space or + # vm86-space 1: #else - movl %esp, %eax + movl %esp, %eax #endif TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - movb PT_CS(%esp), %bl + movb PT_CS(%esp), %bl andb $SEGMENT_RPL_MASK, %bl cmpb $USER_RPL, %bl - jb resume_kernel - xorl %edx, %edx - call do_notify_resume - jmp resume_userspace + jb resume_kernel + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace #ifdef CONFIG_VM86 ALIGN work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - call save_v86_state # %eax contains pt_regs pointer - popl %ecx - movl %eax, %esp - jmp 1b + pushl %ecx # save ti_flags for do_notify_resume + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + movl %eax, %esp + jmp 1b #endif END(work_pending) # perform syscall exit tracing ALIGN syscall_trace_entry: - movl $-ENOSYS,PT_EAX(%esp) - movl %esp, %eax - call syscall_trace_enter + movl $-ENOSYS, PT_EAX(%esp) + movl %esp, %eax + call syscall_trace_enter /* What it returned is what we'll actually use. */ - cmpl $(NR_syscalls), %eax - jnae syscall_call - jmp syscall_exit + cmpl $(NR_syscalls), %eax + jnae syscall_call + jmp syscall_exit END(syscall_trace_entry) # perform syscall exit tracing ALIGN syscall_exit_work: - testl $_TIF_WORK_SYSCALL_EXIT, %ecx - jz work_pending + testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call - # schedule() instead - movl %esp, %eax - call syscall_trace_leave - jmp resume_userspace + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call + # schedule() instead + movl %esp, %eax + call syscall_trace_leave + jmp resume_userspace END(syscall_exit_work) syscall_fault: ASM_CLAC GET_THREAD_INFO(%ebp) - movl $-EFAULT,PT_EAX(%esp) - jmp resume_userspace + movl $-EFAULT, PT_EAX(%esp) + jmp resume_userspace END(syscall_fault) syscall_badsys: - movl $-ENOSYS,%eax - jmp syscall_after_call + movl $-ENOSYS, %eax + jmp syscall_after_call END(syscall_badsys) sysenter_badsys: - movl $-ENOSYS,%eax - jmp sysenter_after_call + movl $-ENOSYS, %eax + jmp sysenter_after_call END(sysenter_badsys) .macro FIXUP_ESPFIX_STACK @@ -613,24 +607,24 @@ END(sysenter_badsys) */ #ifdef CONFIG_X86_ESPFIX32 /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax - addl %esp, %eax /* the adjusted stack pointer */ - pushl $__KERNEL_DS - pushl %eax - lss (%esp), %esp /* switch to the normal stack segment */ + addl %esp, %eax /* the adjusted stack pointer */ + pushl $__KERNEL_DS + pushl %eax + lss (%esp), %esp /* switch to the normal stack segment */ #endif .endm .macro UNWIND_ESPFIX_STACK #ifdef CONFIG_X86_ESPFIX32 - movl %ss, %eax + movl %ss, %eax /* see if on espfix stack */ - cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es /* switch to normal stack */ FIXUP_ESPFIX_STACK 27: @@ -645,7 +639,7 @@ END(sysenter_badsys) ENTRY(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl $(~vector+0x80) /* Note: always in signed byte range */ + pushl $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt .align 8 @@ -659,35 +653,34 @@ END(irq_entries_start) .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: ASM_CLAC - addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ + addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ SAVE_ALL TRACE_IRQS_OFF - movl %esp,%eax - call do_IRQ - jmp ret_from_intr + movl %esp, %eax + call do_IRQ + jmp ret_from_intr ENDPROC(common_interrupt) #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ ASM_CLAC; \ - pushl $~(nr); \ + pushl $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ - movl %esp,%eax; \ - call fn; \ - jmp ret_from_intr; \ + movl %esp, %eax; \ + call fn; \ + jmp ret_from_intr; \ ENDPROC(name) #ifdef CONFIG_TRACING -#define TRACE_BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) +# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) #else -#define TRACE_BUILD_INTERRUPT(name, nr) +# define TRACE_BUILD_INTERRUPT(name, nr) #endif -#define BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(name, nr, smp_##name); \ +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ TRACE_BUILD_INTERRUPT(name, nr) /* The include is where all of the SMP etc. interrupts come from */ @@ -695,30 +688,30 @@ ENDPROC(name) ENTRY(coprocessor_error) ASM_CLAC - pushl $0 - pushl $do_coprocessor_error - jmp error_code + pushl $0 + pushl $do_coprocessor_error + jmp error_code END(coprocessor_error) ENTRY(simd_coprocessor_error) ASM_CLAC - pushl $0 + pushl $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl $do_general_protection", \ - "pushl $do_simd_coprocessor_error", \ + ALTERNATIVE "pushl $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ X86_FEATURE_XMM #else - pushl $do_simd_coprocessor_error + pushl $do_simd_coprocessor_error #endif - jmp error_code + jmp error_code END(simd_coprocessor_error) ENTRY(device_not_available) ASM_CLAC - pushl $-1 # mark this as an int - pushl $do_device_not_available - jmp error_code + pushl $-1 # mark this as an int + pushl $do_device_not_available + jmp error_code END(device_not_available) #ifdef CONFIG_PARAVIRT @@ -735,165 +728,171 @@ END(native_irq_enable_sysexit) ENTRY(overflow) ASM_CLAC - pushl $0 - pushl $do_overflow - jmp error_code + pushl $0 + pushl $do_overflow + jmp error_code END(overflow) ENTRY(bounds) ASM_CLAC - pushl $0 - pushl $do_bounds - jmp error_code + pushl $0 + pushl $do_bounds + jmp error_code END(bounds) ENTRY(invalid_op) ASM_CLAC - pushl $0 - pushl $do_invalid_op - jmp error_code + pushl $0 + pushl $do_invalid_op + jmp error_code END(invalid_op) ENTRY(coprocessor_segment_overrun) ASM_CLAC - pushl $0 - pushl $do_coprocessor_segment_overrun - jmp error_code + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp error_code END(coprocessor_segment_overrun) ENTRY(invalid_TSS) ASM_CLAC - pushl $do_invalid_TSS - jmp error_code + pushl $do_invalid_TSS + jmp error_code END(invalid_TSS) ENTRY(segment_not_present) ASM_CLAC - pushl $do_segment_not_present - jmp error_code + pushl $do_segment_not_present + jmp error_code END(segment_not_present) ENTRY(stack_segment) ASM_CLAC - pushl $do_stack_segment - jmp error_code + pushl $do_stack_segment + jmp error_code END(stack_segment) ENTRY(alignment_check) ASM_CLAC - pushl $do_alignment_check - jmp error_code + pushl $do_alignment_check + jmp error_code END(alignment_check) ENTRY(divide_error) ASM_CLAC - pushl $0 # no error code - pushl $do_divide_error - jmp error_code + pushl $0 # no error code + pushl $do_divide_error + jmp error_code END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) ASM_CLAC - pushl $0 - pushl machine_check_vector - jmp error_code + pushl $0 + pushl machine_check_vector + jmp error_code END(machine_check) #endif ENTRY(spurious_interrupt_bug) ASM_CLAC - pushl $0 - pushl $do_spurious_interrupt_bug - jmp error_code + pushl $0 + pushl $do_spurious_interrupt_bug + jmp error_code END(spurious_interrupt_bug) #ifdef CONFIG_XEN -/* Xen doesn't set %esp to be precisely what the normal sysenter - entrypoint expects, so fix it up before using the normal path. */ +/* + * Xen doesn't set %esp to be precisely what the normal SYSENTER + * entry point expects, so fix it up before using the normal path. + */ ENTRY(xen_sysenter_target) - addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp ENTRY(xen_hypervisor_callback) - pushl $-1 /* orig_ax = -1 => not a system call */ + pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL TRACE_IRQS_OFF - /* Check to see if we got the event in the critical - region in xen_iret_direct, after we've reenabled - events and checked for pending events. This simulates - iret instruction's behaviour where it delivers a - pending interrupt when enabling interrupts. */ - movl PT_EIP(%esp),%eax - cmpl $xen_iret_start_crit,%eax - jb 1f - cmpl $xen_iret_end_crit,%eax - jae 1f + /* + * Check to see if we got the event in the critical + * region in xen_iret_direct, after we've reenabled + * events and checked for pending events. This simulates + * iret instruction's behaviour where it delivers a + * pending interrupt when enabling interrupts: + */ + movl PT_EIP(%esp), %eax + cmpl $xen_iret_start_crit, %eax + jb 1f + cmpl $xen_iret_end_crit, %eax + jae 1f - jmp xen_iret_crit_fixup + jmp xen_iret_crit_fixup ENTRY(xen_do_upcall) -1: mov %esp, %eax - call xen_evtchn_do_upcall +1: mov %esp, %eax + call xen_evtchn_do_upcall #ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall + call xen_maybe_preempt_hcall #endif - jmp ret_from_intr + jmp ret_from_intr ENDPROC(xen_hypervisor_callback) -# Hypervisor uses this for application faults while it executes. -# We get here for two reasons: -# 1. Fault while reloading DS, ES, FS or GS -# 2. Fault while executing IRET -# Category 1 we fix up by reattempting the load, and zeroing the segment -# register if the load fails. -# Category 2 we fix up by jumping to do_iret_error. We cannot use the -# normal Linux return path in this case because if we use the IRET hypercall -# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -# We distinguish between categories by maintaining a status value in EAX. +/* + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we fix up by reattempting the load, and zeroing the segment + * register if the load fails. + * Category 2 we fix up by jumping to do_iret_error. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by maintaining a status value in EAX. + */ ENTRY(xen_failsafe_callback) - pushl %eax - movl $1,%eax -1: mov 4(%esp),%ds -2: mov 8(%esp),%es -3: mov 12(%esp),%fs -4: mov 16(%esp),%gs + pushl %eax + movl $1, %eax +1: mov 4(%esp), %ds +2: mov 8(%esp), %es +3: mov 12(%esp), %fs +4: mov 16(%esp), %gs /* EAX == 0 => Category 1 (Bad segment) EAX != 0 => Category 2 (Bad IRET) */ - testl %eax,%eax - popl %eax - lea 16(%esp),%esp - jz 5f - jmp iret_exc -5: pushl $-1 /* orig_ax = -1 => not a system call */ + testl %eax, %eax + popl %eax + lea 16(%esp), %esp + jz 5f + jmp iret_exc +5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL - jmp ret_from_exception - -.section .fixup,"ax" -6: xorl %eax,%eax - movl %eax,4(%esp) - jmp 1b -7: xorl %eax,%eax - movl %eax,8(%esp) - jmp 2b -8: xorl %eax,%eax - movl %eax,12(%esp) - jmp 3b -9: xorl %eax,%eax - movl %eax,16(%esp) - jmp 4b + jmp ret_from_exception + +.section .fixup, "ax" +6: xorl %eax, %eax + movl %eax, 4(%esp) + jmp 1b +7: xorl %eax, %eax + movl %eax, 8(%esp) + jmp 2b +8: xorl %eax, %eax + movl %eax, 12(%esp) + jmp 3b +9: xorl %eax, %eax + movl %eax, 16(%esp) + jmp 4b .previous - _ASM_EXTABLE(1b,6b) - _ASM_EXTABLE(2b,7b) - _ASM_EXTABLE(3b,8b) - _ASM_EXTABLE(4b,9b) + _ASM_EXTABLE(1b, 6b) + _ASM_EXTABLE(2b, 7b) + _ASM_EXTABLE(3b, 8b) + _ASM_EXTABLE(4b, 9b) ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, xen_evtchn_do_upcall) -#endif /* CONFIG_XEN */ +#endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) @@ -910,28 +909,28 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) - pushl %eax - pushl %ecx - pushl %edx - pushl $0 /* Pass NULL as regs pointer */ - movl 4*4(%esp), %eax - movl 0x4(%ebp), %edx - movl function_trace_op, %ecx - subl $MCOUNT_INSN_SIZE, %eax + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax + movl 0x4(%ebp), %edx + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax .globl ftrace_call ftrace_call: - call ftrace_stub + call ftrace_stub - addl $4,%esp /* skip NULL pointer */ - popl %edx - popl %ecx - popl %eax + addl $4, %esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax ftrace_ret: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call ftrace_graph_call: - jmp ftrace_stub + jmp ftrace_stub #endif .globl ftrace_stub @@ -949,72 +948,72 @@ ENTRY(ftrace_regs_caller) * as the current return ip is. We move the return ip into the * ip location, and move flags into the return ip location. */ - pushl 4(%esp) /* save return ip into ip slot */ - - pushl $0 /* Load 0 into orig_ax */ - pushl %gs - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - - movl 13*4(%esp), %eax /* Get the saved flags */ - movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ - /* clobbering return ip */ - movl $__KERNEL_CS,13*4(%esp) - - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ + pushl 4(%esp) /* save return ip into ip slot */ + + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 13*4(%esp), %eax /* Get the saved flags */ + movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ + /* clobbering return ip */ + movl $__KERNEL_CS, 13*4(%esp) + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ GLOBAL(ftrace_regs_call) - call ftrace_stub - - addl $4, %esp /* Skip pt_regs */ - movl 14*4(%esp), %eax /* Move flags back into cs */ - movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ - movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - movl %eax, 14*4(%esp) /* Put return ip back for ret */ - - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - addl $8, %esp /* Skip orig_ax and ip */ - popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + movl 14*4(%esp), %eax /* Move flags back into cs */ + movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ + movl 12*4(%esp), %eax /* Get return ip from regs->ip */ + movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + addl $8, %esp /* Skip orig_ax and ip */ + popf /* Pop flags at end (no addl to corrupt flags) */ + jmp ftrace_ret popf - jmp ftrace_stub + jmp ftrace_stub #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) - cmpl $__PAGE_OFFSET, %esp - jb ftrace_stub /* Paging not enabled yet? */ + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ - cmpl $ftrace_stub, ftrace_trace_function - jnz trace + cmpl $ftrace_stub, ftrace_trace_function + jnz trace #ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller - cmpl $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller #endif .globl ftrace_stub ftrace_stub: @@ -1022,92 +1021,92 @@ ftrace_stub: /* taken from glibc */ trace: - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - movl 0x4(%ebp), %edx - subl $MCOUNT_INSN_SIZE, %eax - - call *ftrace_trace_function - - popl %edx - popl %ecx - popl %eax - jmp ftrace_stub + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - lea 0x4(%ebp), %edx - movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %eax - call prepare_ftrace_return - popl %edx - popl %ecx - popl %eax + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax ret END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl %eax - pushl %edx - movl %ebp, %eax - call ftrace_return_to_handler - movl %eax, %ecx - popl %edx - popl %eax - jmp *%ecx + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx #endif #ifdef CONFIG_TRACING ENTRY(trace_page_fault) ASM_CLAC - pushl $trace_do_page_fault - jmp error_code + pushl $trace_do_page_fault + jmp error_code END(trace_page_fault) #endif ENTRY(page_fault) ASM_CLAC - pushl $do_page_fault + pushl $do_page_fault ALIGN error_code: /* the function address is in %gs's slot on the stack */ - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs UNWIND_ESPFIX_STACK GS_TO_REG %ecx - movl PT_GS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart REG_TO_PTGS %ecx SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception + movl %esp, %eax # pt_regs pointer + call *%edi + jmp ret_from_exception END(page_fault) /* @@ -1124,28 +1123,28 @@ END(page_fault) * the instruction that would have done it for sysenter. */ .macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok + cmpw $__KERNEL_CS, 4(%esp) + jne \ok \label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp + movl TSS_sysenter_sp0 + \offset(%esp), %esp pushfl - pushl $__KERNEL_CS - pushl $sysenter_past_esp + pushl $__KERNEL_CS + pushl $sysenter_past_esp .endm ENTRY(debug) ASM_CLAC - cmpl $entry_SYSENTER_32,(%esp) - jne debug_stack_correct + cmpl $entry_SYSENTER_32, (%esp) + jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: - pushl $-1 # mark this as an int + pushl $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception + xorl %edx, %edx # error code 0 + movl %esp, %eax # pt_regs pointer + call do_debug + jmp ret_from_exception END(debug) /* @@ -1159,91 +1158,91 @@ END(debug) ENTRY(nmi) ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 - pushl %eax - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - je nmi_espfix_stack + pushl %eax + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + je nmi_espfix_stack #endif - cmpl $entry_SYSENTER_32,(%esp) - je nmi_stack_fixup - pushl %eax - movl %esp,%eax - /* Do not access memory above the end of our stack page, + cmpl $entry_SYSENTER_32, (%esp) + je nmi_stack_fixup + pushl %eax + movl %esp, %eax + /* + * Do not access memory above the end of our stack page, * it might not exist. */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - jae nmi_stack_correct - cmpl $entry_SYSENTER_32,12(%esp) - je nmi_debug_stack_check + andl $(THREAD_SIZE-1), %eax + cmpl $(THREAD_SIZE-20), %eax + popl %eax + jae nmi_stack_correct + cmpl $entry_SYSENTER_32, 12(%esp) + je nmi_debug_stack_check nmi_stack_correct: - pushl %eax + pushl %eax SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_all_notrace + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace nmi_stack_fixup: FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct + jmp nmi_stack_correct nmi_debug_stack_check: - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct + cmpw $__KERNEL_CS, 16(%esp) + jne nmi_stack_correct + cmpl $debug, (%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn, (%esp) + ja nmi_stack_correct FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct + jmp nmi_stack_correct #ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: /* * create the pointer to lss back */ - pushl %ss - pushl %esp - addl $4, (%esp) + pushl %ss + pushl %esp + addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 - pushl 16(%esp) + pushl 16(%esp) .endr - pushl %eax + pushl %eax SAVE_ALL - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx, %edx # zero error code + call do_nmi RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - jmp irq_return + lss 12+4(%esp), %esp # back to espfix stack + jmp irq_return #endif END(nmi) ENTRY(int3) ASM_CLAC - pushl $-1 # mark this as an int + pushl $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + call do_int3 + jmp ret_from_exception END(int3) ENTRY(general_protection) - pushl $do_general_protection - jmp error_code + pushl $do_general_protection + jmp error_code END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) ASM_CLAC - pushl $do_async_page_fault - jmp error_code + pushl $do_async_page_fault + jmp error_code END(async_page_fault) #endif - -- cgit v1.2.1 From 633adc711de0bcb6d6e1c071302880e0c8c05d57 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 4 Jun 2015 16:37:56 -0500 Subject: PCI: Remove unnecessary #includes of In include/linux/pci.h, we already #include , so we don't need to include directly. Remove the unnecessary includes. All the files here already include . Signed-off-by: Bjorn Helgaas Acked-by: Simon Horman # sh Acked-by: Ralf Baechle --- arch/x86/kernel/x86_init.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 234b0722de53..eed5625b10e8 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.1 From 01d72a95188880b22190e937ed8718ed4b45bdce Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 4 Jun 2015 16:38:17 -0500 Subject: PCI: Remove unused pci_dma_burst_advice() pci_dma_burst_advice() was added by e24c2d963a60 ("[PATCH] PCI: DMA bursting advice") but apparently never used. Remove it. Signed-off-by: Bjorn Helgaas Acked-by: Michal Simek # microblaze CC: David S. Miller --- arch/x86/include/asm/pci.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 4e370a5d8117..e51c229a29a4 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -80,13 +80,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, #ifdef CONFIG_PCI extern void early_quirks(void); -static inline void pci_dma_burst_advice(struct pci_dev *pdev, - enum pci_dma_burst_strategy *strat, - unsigned long *strategy_parameter) -{ - *strat = PCI_DMA_BURST_INFINITY; - *strategy_parameter = ~0UL; -} #else static inline void early_quirks(void) { } #endif -- cgit v1.2.1 From 4d7321381e5c7102a3d3faf0a0a0035a09619612 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 20:43:07 +0200 Subject: x86/asm/entry/64: Clean up entry_64.S Make the 64-bit syscall entry code a bit more readable: - use consistent assembly coding style similar to the other entry_*.S files - remove old comments that are not true anymore - eliminate whitespace noise - use consistent vertical spacing - fix various comments - reorganize entry point generation tables to be more readable No code changed: # arch/x86/entry/entry_64.o: text data bss dec hex filename 12282 0 0 12282 2ffa entry_64.o.before 12282 0 0 12282 2ffa entry_64.o.after md5: cbab1f2d727a2a8a87618eeb79f391b7 entry_64.o.before.asm cbab1f2d727a2a8a87618eeb79f391b7 entry_64.o.after.asm Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 820 +++++++++++++++++++++++----------------------- 1 file changed, 404 insertions(+), 416 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d2a0ed211bed..bd97161f90cb 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -4,26 +4,20 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * Copyright (C) 2000 Pavel Machek - */ - -/* + * * entry.S contains the system-call and fault low-level handling routines. * * Some of this is documented in Documentation/x86/entry_64.txt * - * NOTE: This code handles signal-recognition, which happens every time - * after an interrupt and after each system call. - * * A note on terminology: - * - iret frame: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. + * - iret frame: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. * * Some macro usage: - * - ENTRY/END Define functions in the symbol table. - * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - idtentry - Define exception entry points. + * - ENTRY/END: Define functions in the symbol table. + * - TRACE_IRQ_*: Trace hardirq state for lock debugging. + * - idtentry: Define exception entry points. */ - #include #include #include @@ -46,13 +40,12 @@ /* Avoid __ASSEMBLER__'ifying just for this. */ #include -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - - .code64 - .section .entry.text, "ax" +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 +.code64 +.section .entry.text, "ax" #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) @@ -61,11 +54,10 @@ ENTRY(native_usergs_sysret64) ENDPROC(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ - .macro TRACE_IRQS_IRETQ #ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f + bt $9, EFLAGS(%rsp) /* interrupts off? */ + jnc 1f TRACE_IRQS_ON 1: #endif @@ -85,34 +77,34 @@ ENDPROC(native_usergs_sysret64) #if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) .macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero + call debug_stack_set_zero TRACE_IRQS_OFF - call debug_stack_reset + call debug_stack_reset .endm .macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero + call debug_stack_set_zero TRACE_IRQS_ON - call debug_stack_reset + call debug_stack_reset .endm .macro TRACE_IRQS_IRETQ_DEBUG - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f + bt $9, EFLAGS(%rsp) /* interrupts off? */ + jnc 1f TRACE_IRQS_ON_DEBUG 1: .endm #else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ #endif /* - * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. + * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * - * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC * are not needed). SYSCALL does not save anything on the stack @@ -128,7 +120,7 @@ ENDPROC(native_usergs_sysret64) * r10 arg3 (needs to be moved to rcx to conform to C ABI) * r8 arg4 * r9 arg5 - * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) + * (note: r12-r15, rbp, rbx are callee-preserved in C ABI) * * Only called from user space. * @@ -151,12 +143,12 @@ ENTRY(entry_SYSCALL_64) */ GLOBAL(entry_SYSCALL_64_after_swapgs) - movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp + movq %rsp, PER_CPU_VAR(rsp_scratch) + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ /* * Re-enable interrupts. * We use 'rsp_scratch' as a scratch space, hence irq-off block above @@ -165,34 +157,34 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) * with using rsp_scratch: */ ENABLE_INTERRUPTS(CLBR_NONE) - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 */ - pushq %r9 /* pt_regs->r9 */ - pushq %r10 /* pt_regs->r10 */ - pushq %r11 /* pt_regs->r11 */ - sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz tracesys entry_SYSCALL_64_fastpath: #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax + cmpq $__NR_syscall_max, %rax #else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax + andl $__SYSCALL_MASK, %eax + cmpl $__NR_syscall_max, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10, %rcx + call *sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: /* * Syscall return path ending with SYSRET (fast path). @@ -213,15 +205,15 @@ entry_SYSCALL_64_fastpath: * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is * very bad. */ - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp),%rcx - movq EFLAGS(%rsp),%r11 - movq RSP(%rsp),%rsp + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + movq RSP(%rsp), %rsp /* - * 64bit SYSRET restores rip from rcx, + * 64-bit SYSRET restores rip from rcx, * rflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. * Restoration of rflags re-enables interrupts. @@ -239,21 +231,21 @@ entry_SYSCALL_64_fastpath: /* Do syscall entry tracing */ tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + call syscall_trace_enter_phase1 + test %rax, %rax + jnz tracesys_phase2 /* if needed, run the slow path */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX(%rsp), %rax + jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ tracesys_phase2: SAVE_EXTRA_REGS - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax,%rdx - call syscall_trace_enter_phase2 + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + movq %rax, %rdx + call syscall_trace_enter_phase2 /* * Reload registers from stack in case ptrace changed them. @@ -263,15 +255,15 @@ tracesys_phase2: RESTORE_C_REGS_EXCEPT_RAX RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax + cmpq $__NR_syscall_max, %rax #else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax + andl $__SYSCALL_MASK, %eax + cmpl $__NR_syscall_max, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx /* fixup for C */ - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10, %rcx /* fixup for C */ + call *sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: /* Use IRET because user could have changed pt_regs->foo */ @@ -283,31 +275,33 @@ GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ TRACE_IRQS_OFF - movl $_TIF_ALLWORK_MASK,%edi + movl $_TIF_ALLWORK_MASK, %edi /* edi: mask to check */ GLOBAL(int_with_check) LOCKDEP_SYS_EXIT_IRQ GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) + movl TI_flags(%rcx), %edx + andl %edi, %edx + jnz int_careful + andl $~TS_COMPAT, TI_status(%rcx) jmp syscall_return - /* Either reschedule or signal or syscall exit tracking needed. */ - /* First do a reschedule test. */ - /* edx: work, edi: workmask */ + /* + * Either reschedule or signal or syscall exit tracking needed. + * First do a reschedule test. + * edx: work, edi: workmask + */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + bt $TIF_NEED_RESCHED, %edx + jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi + pushq %rdi SCHEDULE_USER - popq %rdi + popq %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp int_with_check + jmp int_with_check /* handle signals and tracing -- both require a full pt_regs */ int_very_careful: @@ -315,27 +309,27 @@ int_very_careful: ENABLE_INTERRUPTS(CLBR_NONE) SAVE_EXTRA_REGS /* Check for syscall exit trace */ - testl $_TIF_WORK_SYSCALL_EXIT,%edx - jz int_signal - pushq %rdi - leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace_leave - popq %rdi - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi - jmp int_restore_rest + testl $_TIF_WORK_SYSCALL_EXIT, %edx + jz int_signal + pushq %rdi + leaq 8(%rsp), %rdi /* &ptregs -> arg1 */ + call syscall_trace_leave + popq %rdi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi + jmp int_restore_rest int_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call do_notify_resume -1: movl $_TIF_WORK_MASK,%edi + testl $_TIF_DO_NOTIFY_MASK, %edx + jz 1f + movq %rsp, %rdi /* &ptregs -> arg1 */ + xorl %esi, %esi /* oldset -> arg2 */ + call do_notify_resume +1: movl $_TIF_WORK_MASK, %edi int_restore_rest: RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp int_with_check + jmp int_with_check syscall_return: /* The IRETQ could re-enable interrupts: */ @@ -346,10 +340,10 @@ syscall_return: * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. */ - movq RCX(%rsp),%rcx - movq RIP(%rsp),%r11 - cmpq %rcx,%r11 /* RCX == RIP */ - jne opportunistic_sysret_failed + movq RCX(%rsp), %rcx + movq RIP(%rsp), %r11 + cmpq %rcx, %r11 /* RCX == RIP */ + jne opportunistic_sysret_failed /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP @@ -362,19 +356,21 @@ syscall_return: .ifne __VIRTUAL_MASK_SHIFT - 47 .error "virtual address width changed -- SYSRET checks need update" .endif + /* Change top 16 bits to be the sign-extension of 47th bit */ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 jne opportunistic_sysret_failed - cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed + cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed - movq R11(%rsp),%r11 - cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed + movq R11(%rsp), %r11 + cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed /* * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, @@ -383,29 +379,29 @@ syscall_return: * with register state that satisfies the opportunistic SYSRET * conditions. For example, single-stepping this user code: * - * movq $stuck_here,%rcx + * movq $stuck_here, %rcx * pushfq * popq %r11 * stuck_here: * * would never get past 'stuck_here'. */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed /* nothing to check for RSP */ - cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed + cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed /* - * We win! This label is here just for ease of understanding - * perf profiles. Nothing jumps here. + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp),%rsp + movq RSP(%rsp), %rsp USERGS_SYSRET64 opportunistic_sysret_failed: @@ -417,7 +413,7 @@ END(entry_SYSCALL_64) .macro FORK_LIKE func ENTRY(stub_\func) SAVE_EXTRA_REGS 8 - jmp sys_\func + jmp sys_\func END(stub_\func) .endm @@ -436,7 +432,7 @@ return_from_execve: /* must use IRET code path (pt_regs->cs may have changed) */ addq $8, %rsp ZERO_EXTRA_REGS - movq %rax,RAX(%rsp) + movq %rax, RAX(%rsp) jmp int_ret_from_sys_call END(stub_execve) /* @@ -479,19 +475,19 @@ ENTRY(stub_rt_sigreturn) * we SAVE_EXTRA_REGS here. */ SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn + call sys_rt_sigreturn return_from_stub: addq $8, %rsp RESTORE_EXTRA_REGS - movq %rax,RAX(%rsp) - jmp int_ret_from_sys_call + movq %rax, RAX(%rsp) + jmp int_ret_from_sys_call END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI ENTRY(stub_x32_rt_sigreturn) SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub + call sys32_x32_rt_sigreturn + jmp return_from_stub END(stub_x32_rt_sigreturn) #endif @@ -502,16 +498,16 @@ END(stub_x32_rt_sigreturn) */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK,TI_flags(%r8) + LOCK ; btr $TIF_FORK, TI_flags(%r8) - pushq $0x0002 - popfq # reset kernel eflags + pushq $0x0002 + popfq /* reset kernel eflags */ - call schedule_tail # rdi: 'prev' task parameter + call schedule_tail /* rdi: 'prev' task parameter */ RESTORE_EXTRA_REGS - testb $3, CS(%rsp) # from kernel_thread? + testb $3, CS(%rsp) /* from kernel_thread? */ /* * By the time we get here, we have no idea whether our pt_regs, @@ -522,13 +518,15 @@ ENTRY(ret_from_fork) */ jnz int_ret_from_sys_call - /* We came from kernel_thread */ - /* nb: we depend on RESTORE_EXTRA_REGS above */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) + /* + * We came from kernel_thread + * nb: we depend on RESTORE_EXTRA_REGS above + */ + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + jmp int_ret_from_sys_call END(ret_from_fork) /* @@ -539,7 +537,7 @@ END(ret_from_fork) ENTRY(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushq $(~vector+0x80) /* Note: always in signed byte range */ + pushq $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt .align 8 @@ -569,7 +567,7 @@ END(irq_entries_start) /* this goes to 0(%rsp) for unwinder, not for saving the value: */ SAVE_EXTRA_REGS_RBP -RBP - leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ + leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */ testb $3, CS-RBP(%rsp) jz 1f @@ -582,14 +580,14 @@ END(irq_entries_start) * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ - movq %rsp, %rsi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rsi + movq %rsp, %rsi + incl PER_CPU_VAR(irq_count) + cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp + pushq %rsi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF - call \func + call \func .endm /* @@ -599,36 +597,35 @@ END(irq_entries_start) .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: ASM_CLAC - addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ interrupt do_IRQ /* 0(%rsp): old RSP */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) + decl PER_CPU_VAR(irq_count) /* Restore saved previous stack */ - popq %rsi + popq %rsi /* return code expects complete pt_regs - adjust rsp accordingly: */ - leaq -RBP(%rsi),%rsp + leaq -RBP(%rsi), %rsp testb $3, CS(%rsp) jz retint_kernel /* Interrupt came from user space */ retint_user: GET_THREAD_INFO(%rcx) - /* - * %rcx: thread info. Interrupts off. - */ + + /* %rcx: thread info. Interrupts are off. */ retint_with_reschedule: - movl $_TIF_WORK_MASK,%edi + movl $_TIF_WORK_MASK, %edi retint_check: LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz retint_careful + movl TI_flags(%rcx), %edx + andl %edi, %edx + jnz retint_careful -retint_swapgs: /* return to user-space */ +retint_swapgs: /* return to user-space */ /* * The iretq could re-enable interrupts: */ @@ -643,9 +640,9 @@ retint_kernel: #ifdef CONFIG_PREEMPT /* Interrupts are off */ /* Check if we need preemption */ - bt $9,EFLAGS(%rsp) /* interrupts were off? */ + bt $9, EFLAGS(%rsp) /* were interrupts off? */ jnc 1f -0: cmpl $0,PER_CPU_VAR(__preempt_count) +0: cmpl $0, PER_CPU_VAR(__preempt_count) jnz 1f call preempt_schedule_irq jmp 0b @@ -671,8 +668,8 @@ ENTRY(native_iret) * 64-bit mode SS:RSP on the exception stack is always valid. */ #ifdef CONFIG_X86_ESPFIX64 - testb $4,(SS-RIP)(%rsp) - jnz native_irq_return_ldt + testb $4, (SS-RIP)(%rsp) + jnz native_irq_return_ldt #endif .global native_irq_return_iret @@ -687,59 +684,59 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: - pushq %rax - pushq %rdi + pushq %rax + pushq %rdi SWAPGS - movq PER_CPU_VAR(espfix_waddr),%rdi - movq %rax,(0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp),%rax /* RIP */ - movq %rax,(1*8)(%rdi) - movq (3*8)(%rsp),%rax /* CS */ - movq %rax,(2*8)(%rdi) - movq (4*8)(%rsp),%rax /* RFLAGS */ - movq %rax,(3*8)(%rdi) - movq (6*8)(%rsp),%rax /* SS */ - movq %rax,(5*8)(%rdi) - movq (5*8)(%rsp),%rax /* RSP */ - movq %rax,(4*8)(%rdi) - andl $0xffff0000,%eax - popq %rdi - orq PER_CPU_VAR(espfix_stack),%rax + movq PER_CPU_VAR(espfix_waddr), %rdi + movq %rax, (0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp), %rax /* RIP */ + movq %rax, (1*8)(%rdi) + movq (3*8)(%rsp), %rax /* CS */ + movq %rax, (2*8)(%rdi) + movq (4*8)(%rsp), %rax /* RFLAGS */ + movq %rax, (3*8)(%rdi) + movq (6*8)(%rsp), %rax /* SS */ + movq %rax, (5*8)(%rdi) + movq (5*8)(%rsp), %rax /* RSP */ + movq %rax, (4*8)(%rdi) + andl $0xffff0000, %eax + popq %rdi + orq PER_CPU_VAR(espfix_stack), %rax SWAPGS - movq %rax,%rsp - popq %rax - jmp native_irq_return_iret + movq %rax, %rsp + popq %rax + jmp native_irq_return_iret #endif /* edi: workmask, edx: work */ retint_careful: - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + bt $TIF_NEED_RESCHED, %edx + jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi + pushq %rdi SCHEDULE_USER - popq %rdi + popq %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp retint_check + jmp retint_check retint_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz retint_swapgs + testl $_TIF_DO_NOTIFY_MASK, %edx + jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_EXTRA_REGS - movq $-1,ORIG_RAX(%rsp) - xorl %esi,%esi # oldset - movq %rsp,%rdi # &pt_regs - call do_notify_resume + movq $-1, ORIG_RAX(%rsp) + xorl %esi, %esi /* oldset */ + movq %rsp, %rdi /* &pt_regs */ + call do_notify_resume RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) - jmp retint_with_reschedule + jmp retint_with_reschedule END(common_interrupt) @@ -749,10 +746,10 @@ END(common_interrupt) .macro apicinterrupt3 num sym do_sym ENTRY(\sym) ASM_CLAC - pushq $~(\num) + pushq $~(\num) .Lcommon_\sym: interrupt \do_sym - jmp ret_from_intr + jmp ret_from_intr END(\sym) .endm @@ -774,60 +771,45 @@ trace_apicinterrupt \num \sym .endm #ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ - irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR \ - reboot_interrupt smp_reboot_interrupt +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE \ - uv_bau_message_intr1 uv_bau_message_interrupt +apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt #endif -apicinterrupt LOCAL_TIMER_VECTOR \ - apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR \ - x86_platform_ipi smp_x86_platform_ipi + +apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR \ - kvm_posted_intr_ipi smp_kvm_posted_intr_ipi -apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \ - kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi +apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt smp_threshold_interrupt +apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt #endif #ifdef CONFIG_X86_MCE_AMD -apicinterrupt DEFERRED_ERROR_VECTOR \ - deferred_error_interrupt smp_deferred_error_interrupt +apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt #endif #ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR \ - thermal_interrupt smp_thermal_interrupt +apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt #endif #ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ - call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR \ - call_function_interrupt smp_call_function_interrupt -apicinterrupt RESCHEDULE_VECTOR \ - reschedule_interrupt smp_reschedule_interrupt +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt #endif -apicinterrupt ERROR_APIC_VECTOR \ - error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR \ - spurious_interrupt smp_spurious_interrupt +apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt #ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR \ - irq_work_interrupt smp_irq_work_interrupt +apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt #endif /* @@ -846,54 +828,54 @@ ENTRY(\sym) PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code - pushq $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 - testb $3, CS(%rsp) /* If coming from userspace, switch */ - jnz 1f /* stacks. */ + testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ + jnz 1f .endif - call paranoid_entry + call paranoid_entry .else - call error_entry + call error_entry .endif /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ .if \paranoid .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ .else TRACE_IRQS_OFF .endif .endif - movq %rsp,%rdi /* pt_regs pointer */ + movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .else - xorl %esi,%esi /* no error code */ + xorl %esi, %esi /* no error code */ .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif - call \do_sym + call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid - jmp paranoid_exit + jmp paranoid_exit .else - jmp error_exit + jmp error_exit .endif .if \paranoid == 1 @@ -903,25 +885,25 @@ ENTRY(\sym) * run in real process context if user_mode(regs). */ 1: - call error_entry + call error_entry - movq %rsp,%rdi /* pt_regs pointer */ - call sync_regs - movq %rax,%rsp /* switch stack */ + movq %rsp, %rdi /* pt_regs pointer */ + call sync_regs + movq %rax, %rsp /* switch stack */ - movq %rsp,%rdi /* pt_regs pointer */ + movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .else - xorl %esi,%esi /* no error code */ + xorl %esi, %esi /* no error code */ .endif - call \do_sym + call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit /* %ebx: no swapgs flag */ .endif END(\sym) .endm @@ -937,55 +919,57 @@ idtentry \sym \do_sym has_error_code=\has_error_code .endm #endif -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - - - /* Reload gs selector with exception handling */ - /* edi: new selector */ +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 + + + /* + * Reload gs selector with exception handling + * edi: new selector + */ ENTRY(native_load_gs_index) pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: - movl %edi,%gs -2: mfence /* workaround */ + movl %edi, %gs +2: mfence /* workaround */ SWAPGS popfq ret END(native_load_gs_index) - _ASM_EXTABLE(gs_change,bad_gs) - .section .fixup,"ax" + _ASM_EXTABLE(gs_change, bad_gs) + .section .fixup, "ax" /* running with kernelgs */ bad_gs: - SWAPGS /* switch back to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b + SWAPGS /* switch back to user gs */ + xorl %eax, %eax + movl %eax, %gs + jmp 2b .previous /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(do_softirq_own_stack) - pushq %rbp - mov %rsp,%rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr),%rsp - push %rbp # backlink for old unwinder - call __do_softirq + pushq %rbp + mov %rsp, %rbp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr), %rsp + push %rbp /* frame pointer backlink */ + call __do_softirq leaveq - decl PER_CPU_VAR(irq_count) + decl PER_CPU_VAR(irq_count) ret END(do_softirq_own_stack) @@ -1005,23 +989,24 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 * existing activation in its critical region -- if so, we pop the current * activation and restart the handler using the previous one. */ -ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) +ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ + /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ - movq %rdi, %rsp # we don't return, adjust the stack frame -11: incl PER_CPU_VAR(irq_count) - movq %rsp,%rbp - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rbp # backlink for old unwinder - call xen_evtchn_do_upcall - popq %rsp - decl PER_CPU_VAR(irq_count) + movq %rdi, %rsp /* we don't return, adjust the stack frame */ +11: incl PER_CPU_VAR(irq_count) + movq %rsp, %rbp + cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp + pushq %rbp /* frame pointer backlink */ + call xen_evtchn_do_upcall + popq %rsp + decl PER_CPU_VAR(irq_count) #ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall + call xen_maybe_preempt_hcall #endif - jmp error_exit + jmp error_exit END(xen_do_hypervisor_callback) /* @@ -1038,35 +1023,35 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) - movl %ds,%ecx - cmpw %cx,0x10(%rsp) - jne 1f - movl %es,%ecx - cmpw %cx,0x18(%rsp) - jne 1f - movl %fs,%ecx - cmpw %cx,0x20(%rsp) - jne 1f - movl %gs,%ecx - cmpw %cx,0x28(%rsp) - jne 1f + movl %ds, %ecx + cmpw %cx, 0x10(%rsp) + jne 1f + movl %es, %ecx + cmpw %cx, 0x18(%rsp) + jne 1f + movl %fs, %ecx + cmpw %cx, 0x20(%rsp) + jne 1f + movl %gs, %ecx + cmpw %cx, 0x28(%rsp) + jne 1f /* All segments match their saved values => Category 2 (Bad IRET). */ - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x30,%rsp - pushq $0 /* RIP */ - pushq %r11 - pushq %rcx - jmp general_protection + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx + jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x30,%rsp - pushq $-1 /* orig_ax = -1 => not a system call */ + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS - jmp error_exit + jmp error_exit END(xen_failsafe_callback) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ @@ -1079,21 +1064,25 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry stack_segment do_stack_segment has_error_code=1 +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 + #ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 #endif -idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 + +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 + #ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 +idtentry async_page_fault do_async_page_fault has_error_code=1 #endif + #ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif /* @@ -1105,13 +1094,13 @@ ENTRY(paranoid_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 - movl $1,%ebx - movl $MSR_GS_BASE,%ecx + movl $1, %ebx + movl $MSR_GS_BASE, %ecx rdmsr - testl %edx,%edx - js 1f /* negative -> in kernel */ + testl %edx, %edx + js 1f /* negative -> in kernel */ SWAPGS - xorl %ebx,%ebx + xorl %ebx, %ebx 1: ret END(paranoid_entry) @@ -1124,16 +1113,17 @@ END(paranoid_entry) * in syscall entry), so checking for preemption here would * be complicated. Fortunately, we there's no good reason * to try to handle preemption here. + * + * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs + testl %ebx, %ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore + jmp paranoid_exit_restore paranoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG paranoid_exit_restore: @@ -1151,7 +1141,7 @@ ENTRY(error_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 - xorl %ebx,%ebx + xorl %ebx, %ebx testb $3, CS+8(%rsp) jz error_kernelspace error_swapgs: @@ -1167,41 +1157,41 @@ error_sti: * for these here too. */ error_kernelspace: - incl %ebx - leaq native_irq_return_iret(%rip),%rcx - cmpq %rcx,RIP+8(%rsp) - je error_bad_iret - movl %ecx,%eax /* zero extend */ - cmpq %rax,RIP+8(%rsp) - je bstep_iret - cmpq $gs_change,RIP+8(%rsp) - je error_swapgs - jmp error_sti + incl %ebx + leaq native_irq_return_iret(%rip), %rcx + cmpq %rcx, RIP+8(%rsp) + je error_bad_iret + movl %ecx, %eax /* zero extend */ + cmpq %rax, RIP+8(%rsp) + je bstep_iret + cmpq $gs_change, RIP+8(%rsp) + je error_swapgs + jmp error_sti bstep_iret: /* Fix truncated RIP */ - movq %rcx,RIP+8(%rsp) + movq %rcx, RIP+8(%rsp) /* fall through */ error_bad_iret: SWAPGS - mov %rsp,%rdi - call fixup_bad_iret - mov %rax,%rsp - decl %ebx /* Return to usergs */ - jmp error_sti + mov %rsp, %rdi + call fixup_bad_iret + mov %rax, %rsp + decl %ebx /* Return to usergs */ + jmp error_sti END(error_entry) /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(error_exit) - movl %ebx,%eax + movl %ebx, %eax RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %eax,%eax - jnz retint_kernel - jmp retint_user + testl %eax, %eax + jnz retint_kernel + jmp retint_user END(error_exit) /* Runs on exception stack */ @@ -1240,21 +1230,21 @@ ENTRY(nmi) */ /* Use %rdx as our temp variable throughout */ - pushq %rdx + pushq %rdx /* * If %cs was not the kernel segment, then the NMI triggered in user * space, which means it is definitely not nested. */ - cmpl $__KERNEL_CS, 16(%rsp) - jne first_nmi + cmpl $__KERNEL_CS, 16(%rsp) + jne first_nmi /* * Check the special variable on the stack to see if NMIs are * executing. */ - cmpl $1, -8(%rsp) - je nested_nmi + cmpl $1, -8(%rsp) + je nested_nmi /* * Now test if the previous stack was an NMI stack. @@ -1268,6 +1258,7 @@ ENTRY(nmi) cmpq %rdx, 4*8(%rsp) /* If the stack pointer is above the NMI stack, this is a normal NMI */ ja first_nmi + subq $EXCEPTION_STKSZ, %rdx cmpq %rdx, 4*8(%rsp) /* If it is below the NMI stack, it is a normal NMI */ @@ -1280,29 +1271,29 @@ nested_nmi: * It's about to repeat the NMI handler, so we are fine * with ignoring this one. */ - movq $repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja 1f - movq $end_repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja nested_nmi_out + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out 1: /* Set up the interrupted NMIs stack to jump to repeat_nmi */ - leaq -1*8(%rsp), %rdx - movq %rdx, %rsp - leaq -10*8(%rsp), %rdx - pushq $__KERNEL_DS - pushq %rdx + leaq -1*8(%rsp), %rdx + movq %rdx, %rsp + leaq -10*8(%rsp), %rdx + pushq $__KERNEL_DS + pushq %rdx pushfq - pushq $__KERNEL_CS - pushq $repeat_nmi + pushq $__KERNEL_CS + pushq $repeat_nmi /* Put stack back */ - addq $(6*8), %rsp + addq $(6*8), %rsp nested_nmi_out: - popq %rdx + popq %rdx /* No need to check faults here */ INTERRUPT_RETURN @@ -1344,19 +1335,17 @@ first_nmi: * is also used by nested NMIs and can not be trusted on exit. */ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ - movq (%rsp), %rdx + movq (%rsp), %rdx /* Set the NMI executing variable on the stack. */ - pushq $1 + pushq $1 - /* - * Leave room for the "copied" frame - */ - subq $(5*8), %rsp + /* Leave room for the "copied" frame */ + subq $(5*8), %rsp /* Copy the stack frame to the Saved frame */ .rept 5 - pushq 11*8(%rsp) + pushq 11*8(%rsp) .endr /* Everything up to here is safe from nested NMIs */ @@ -1376,14 +1365,14 @@ repeat_nmi: * is benign for the non-repeat case, where 1 was pushed just above * to this very stack slot). */ - movq $1, 10*8(%rsp) + movq $1, 10*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ - addq $(10*8), %rsp + addq $(10*8), %rsp .rept 5 - pushq -6*8(%rsp) + pushq -6*8(%rsp) .endr - subq $(5*8), %rsp + subq $(5*8), %rsp end_repeat_nmi: /* @@ -1391,7 +1380,7 @@ end_repeat_nmi: * NMI if the first NMI took an exception and reset our iret stack * so that we repeat another NMI. */ - pushq $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ ALLOC_PT_GPREGS_ON_STACK /* @@ -1401,7 +1390,7 @@ end_repeat_nmi: * setting NEED_RESCHED or anything that normal interrupts and * exceptions might do. */ - call paranoid_entry + call paranoid_entry /* * Save off the CR2 register. If we take a page fault in the NMI then @@ -1412,21 +1401,21 @@ end_repeat_nmi: * origin fault. Save it off and restore it if it changes. * Use the r12 callee-saved register. */ - movq %cr2, %r12 + movq %cr2, %r12 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp,%rdi - movq $-1,%rsi - call do_nmi + movq %rsp, %rdi + movq $-1, %rsi + call do_nmi /* Did the NMI take a page fault? Restore cr2 if it did */ - movq %cr2, %rcx - cmpq %rcx, %r12 - je 1f - movq %r12, %cr2 + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 1: - testl %ebx,%ebx /* swapgs needed? */ - jnz nmi_restore + testl %ebx, %ebx /* swapgs needed? */ + jnz nmi_restore nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: @@ -1436,12 +1425,11 @@ nmi_restore: REMOVE_PT_GPREGS_FROM_STACK 6*8 /* Clear the NMI executing stack variable */ - movq $0, 5*8(%rsp) + movq $0, 5*8(%rsp) INTERRUPT_RETURN END(nmi) ENTRY(ignore_sysret) - mov $-ENOSYS,%eax + mov $-ENOSYS, %eax sysret END(ignore_sysret) - -- cgit v1.2.1 From eb47854415825a69b1c578e7487da571227ba25a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 7 Jun 2015 20:24:30 +0200 Subject: x86/asm/entry/32: Reinstate clearing of pt_regs->r8..r11 on EFAULT path I broke this recently when I changed pt_regs->r8..r11 clearing logic in INT 80 code path. There is a branch from SYSENTER/SYSCALL code to INT 80 code: if we fail to retrieve arg6, we return EFAULT. Before this patch, in this case we don't clear pt_regs->r8..r11. This patch fixes this. The resulting code is smaller and simpler. While at it, remove incorrect comment about syscall dispatching CALL insn: it does not use RIP-relative addressing form (the comment was meant to be "TODO: make this rip-relative", and morphed since then, dropping "TODO"). Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433701470-28800-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 59840e33d203..5757ddec35d6 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -413,9 +413,7 @@ END(entry_SYSCALL_compat) ia32_badarg: ASM_CLAC - movq $-EFAULT, %rax - jmp ia32_sysret - + movq $-EFAULT, RAX(%rsp) ia32_ret_from_sys_call: xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) @@ -486,9 +484,7 @@ ia32_do_call: cmpq $(IA32_NR_syscalls-1), %rax ja 1f - call *ia32_sys_call_table(, %rax, 8) /* RIP relative */ - -ia32_sysret: + call *ia32_sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) 1: jmp int_ret_from_sys_call -- cgit v1.2.1 From bace7117d3fb59a6ed7ea1aa6c8994df6a28a72a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 21:20:26 +0200 Subject: x86/asm/entry: (Re-)rename __NR_entry_INT80_compat_max to __NR_syscall_compat_max Brian Gerst noticed that I did a weird rename in the following commit: b2502b418e63 ("x86/asm/entry: Untangle 'system_call' into two entry points: entry_SYSCALL_64 and entry_INT80_32") which renamed __NR_ia32_syscall_max to __NR_entry_INT80_compat_max. Now the original name was a misnomer, but the new one is a misnomer as well, as all the 32-bit compat syscall entry points (sysenter, syscall) share the system call table, not just the INT80 based one. Rename it to __NR_syscall_compat_max. Reported-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscall_32.c | 6 +++--- arch/x86/kernel/asm-offsets_64.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index e398d033673f..8ea34f94e973 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -10,7 +10,7 @@ #else #define SYM(sym, compat) sym #define ia32_sys_call_table sys_call_table -#define __NR_entry_INT80_compat_max __NR_syscall_max +#define __NR_syscall_compat_max __NR_syscall_max #endif #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; @@ -23,11 +23,11 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_entry_INT80_compat_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_entry_INT80_compat_max] = &sys_ni_syscall, + [0 ... __NR_syscall_compat_max] = &sys_ni_syscall, #include }; diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 599afcf0005f..d8f42f902a0f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -66,7 +66,7 @@ int main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); - DEFINE(__NR_entry_INT80_compat_max, sizeof(syscalls_ia32) - 1); + DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; -- cgit v1.2.1 From 15c1247953e8a45232ed5a5540f291d2d0a77665 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Jun 2015 11:40:28 +0200 Subject: Revert "perf/x86/intel/uncore: Move uncore_box_init() out of driver initialization" This reverts commit c05199e5a57a579fea1e8fa65e2b511ceb524ffc. Vince Weaver reported the following crash while perf fuzzing: [ 79.473121] kernel BUG at mm/vmalloc.c:1335! [ 79.694391] Call Trace: [ 79.696997] [ 79.699090] [] get_vm_area_caller+0x40/0x50 [ 79.705505] [] ? snb_uncore_imc_init_box+0x6d/0x90 [ 79.712414] [] __ioremap_caller+0x195/0x350 [ 79.718610] [] ? snb_uncore_imc_init_box+0x6d/0x90 [ 79.725462] [] ? debug_object_activate+0x14b/0x1e0 [ 79.732346] [] ioremap_nocache+0x17/0x20 [ 79.738283] [] snb_uncore_imc_init_box+0x6d/0x90 [ 79.744945] [] snb_uncore_imc_event_start+0xb7/0x110 [ 79.752020] [] snb_uncore_imc_event_add+0x47/0x60 [ 79.758832] [] event_sched_in.isra.85+0xfb/0x330 [ 79.765519] [] group_sched_in+0x6f/0x1e0 [ 79.771481] [] ? native_sched_clock+0x2a/0x90 [ 79.777858] [] __perf_event_enable+0x25c/0x2a0 [ 79.784418] [] ? tick_nohz_irq_exit+0x29/0x30 [ 79.790820] [] ? cpu_clock_event_start+0x40/0x40 [ 79.797546] [] remote_function+0x50/0x60 [ 79.803535] [] flush_smp_call_function_queue+0x81/0x180 [ 79.810840] [] generic_smp_call_function_single_interrupt+0x13/0x60 [ 79.819328] [] smp_trace_call_function_single_interrupt+0x38/0xc0 [ 79.827614] [] trace_call_function_single_interrupt+0x6e/0x80 [ 79.835465] [ 79.837543] [] ? cpuidle_enter_state+0x65/0x160 [ 79.844377] [] ? cpuidle_enter_state+0x51/0x160 [ 79.851015] [] cpuidle_enter+0x17/0x20 [ 79.856791] [] cpu_startup_entry+0x399/0x440 [ 79.863165] [] rest_init+0xbb/0xd0 The offending commit is clearly confused as it moves heavy initialization work into IPI context. Revert it. Reported-by: Vince Weaver Acked-by: Peter Zijlstra (Intel) Cc: Kan Liang Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: Yan, Zheng Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 9 +++++++-- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 18 ++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index dd319e59246b..90b7c501c95b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -839,6 +839,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id box->phys_id = phys_id; box->pci_dev = pdev; box->pmu = pmu; + uncore_box_init(box); pci_set_drvdata(pdev, box); raw_spin_lock(&uncore_box_lock); @@ -1002,8 +1003,10 @@ static int uncore_cpu_starting(int cpu) pmu = &type->pmus[j]; box = *per_cpu_ptr(pmu->box, cpu); /* called by uncore_cpu_init? */ - if (box && box->phys_id >= 0) + if (box && box->phys_id >= 0) { + uncore_box_init(box); continue; + } for_each_online_cpu(k) { exist = *per_cpu_ptr(pmu->box, k); @@ -1019,8 +1022,10 @@ static int uncore_cpu_starting(int cpu) } } - if (box) + if (box) { box->phys_id = phys_id; + uncore_box_init(box); + } } } return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index f789ec9a0133..ceac8f5dc018 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -258,14 +258,6 @@ static inline int uncore_num_counters(struct intel_uncore_box *box) return box->pmu->type->num_counters; } -static inline void uncore_box_init(struct intel_uncore_box *box) -{ - if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { - if (box->pmu->type->ops->init_box) - box->pmu->type->ops->init_box(box); - } -} - static inline void uncore_disable_box(struct intel_uncore_box *box) { if (box->pmu->type->ops->disable_box) @@ -274,8 +266,6 @@ static inline void uncore_disable_box(struct intel_uncore_box *box) static inline void uncore_enable_box(struct intel_uncore_box *box) { - uncore_box_init(box); - if (box->pmu->type->ops->enable_box) box->pmu->type->ops->enable_box(box); } @@ -298,6 +288,14 @@ static inline u64 uncore_read_counter(struct intel_uncore_box *box, return box->pmu->type->ops->read_counter(box, event); } +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} + static inline bool uncore_box_is_fake(struct intel_uncore_box *box) { return (box->phys_id < 0); -- cgit v1.2.1 From 9b47feb708e50e6114b4b4193eee67f2e5af9d05 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 8 Jun 2015 22:35:33 +0200 Subject: x86/asm/entry: Clean up entry*.S style, final bits A few bits were missed. Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 18 +++++++++--------- arch/x86/entry/entry_64_compat.S | 14 +++++++------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index edd7aadfacfa..21dc60a60b5f 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -12,7 +12,7 @@ * 0(%esp) - %ebx * 4(%esp) - %ecx * 8(%esp) - %edx - * C(%esp) - %esi + * C(%esp) - %esi * 10(%esp) - %edi * 14(%esp) - %ebp * 18(%esp) - %eax @@ -128,7 +128,7 @@ .macro POP_GS pop=0 98: popl %gs .if \pop <> 0 - add $\pop, %esp + add $\pop, %esp .endif .endm .macro POP_GS_EX @@ -487,8 +487,8 @@ ldt_ss: mov %esp, %edx /* load kernel esp */ mov PT_OLDESP(%esp), %eax /* load userspace esp */ mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ - shr $16, %edx + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ pushl $__ESPFIX_SS @@ -507,7 +507,7 @@ ENDPROC(entry_INT80_32) # perform work that needs to be done immediately before resumption ALIGN work_pending: - testb $_TIF_NEED_RESCHED, %cl + testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: call schedule @@ -520,7 +520,7 @@ work_resched: andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testb $_TIF_NEED_RESCHED, %cl jnz work_resched work_notifysig: # deal with pending signals and @@ -537,8 +537,8 @@ work_notifysig: # deal with pending signals and TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) movb PT_CS(%esp), %bl - andb $SEGMENT_RPL_MASK, %bl - cmpb $USER_RPL, %bl + andb $SEGMENT_RPL_MASK, %bl + cmpb $USER_RPL, %bl jb resume_kernel xorl %edx, %edx call do_notify_resume @@ -609,7 +609,7 @@ END(sysenter_badsys) /* fixup the stack */ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ - shl $16, %eax + shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS pushl %eax diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 5757ddec35d6..2093ce6be203 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -106,8 +106,8 @@ ENTRY(entry_SYSENTER_compat) jnz sysenter_fix_flags sysenter_flags_fixed: - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz sysenter_tracesys sysenter_do_call: @@ -138,7 +138,7 @@ sysexit_from_sys_call: * This code path is still called 'sysexit' because it pairs * with 'sysenter' and it uses the SYSENTER calling convention. */ - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) movl RIP(%rsp), %ecx /* User %eip */ RESTORE_RSI_RDI xorl %edx, %edx /* Do not leak kernel information */ @@ -229,7 +229,7 @@ sysexit_audit: #endif sysenter_fix_flags: - pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) popfq jmp sysenter_flags_fixed @@ -325,9 +325,9 @@ ENTRY(entry_SYSCALL_compat) 1: movl (%r8), %ebp _ASM_EXTABLE(1b, ia32_badarg) ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz cstar_tracesys + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz cstar_tracesys cstar_do_call: /* 32-bit syscall -> 64-bit C ABI argument conversion */ -- cgit v1.2.1 From 0c4109bec0a6cde471bef3a21cd6f8384a614469 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:00 -0700 Subject: x86/fpu/xstate: Fix up bad get_xsave_addr() assumptions get_xsave_addr() assumes that if an xsave bit is present in the hardware (pcntxt_mask) that it is present in a given xsave buffer. Due to an bug in the xsave code on all of the systems that have MPX (and thus all the users of this code), that has been a true assumption. But, the bug is getting fixed, so our assumption is not going to hold any more. It's quite possible (and normal) for an enabled state to be present on 'pcntxt_mask', but *not* in 'xstate_bv'. We need to consult 'xstate_bv'. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183700.1E739B34@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 45 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a580eb5c7e52..af3700e0dbd2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -382,19 +382,48 @@ void fpu__resume_cpu(void) * This is the API that is called to get xstate address in either * standard format or compacted format of xsave area. * + * Note that if there is no data for the field in the xsave buffer + * this will return NULL. + * * Inputs: - * xsave: base address of the xsave area; - * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE, - * etc.) + * xstate: the thread's storage area for all FPU data + * xstate_feature: state which is defined in xsave.h (e.g. + * XSTATE_FP, XSTATE_SSE, etc...) * Output: - * address of the state in the xsave area. + * address of the state in the xsave area, or NULL if the + * field is not present in the xsave buffer. */ -void *get_xsave_addr(struct xregs_state *xsave, int xstate) +void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) { - int feature = fls64(xstate) - 1; - if (!test_bit(feature, (unsigned long *)&xfeatures_mask)) + int feature_nr = fls64(xstate_feature) - 1; + /* + * Do we even *have* xsave state? + */ + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return NULL; + + xsave = ¤t->thread.fpu.state.xsave; + /* + * We should not ever be requesting features that we + * have not enabled. Remember that pcntxt_mask is + * what we write to the XCR0 register. + */ + WARN_ONCE(!(xfeatures_mask & xstate_feature), + "get of unsupported state"); + /* + * This assumes the last 'xsave*' instruction to + * have requested that 'xstate_feature' be saved. + * If it did not, we might be seeing and old value + * of the field in the buffer. + * + * This can happen because the last 'xsave' did not + * request that this feature be saved (unlikely) + * or because the "init optimization" caused it + * to not be saved. + */ + if (!(xsave->header.xfeatures & xstate_feature)) return NULL; - return (void *)xsave + xstate_comp_offsets[feature]; + return (void *)xsave + xstate_comp_offsets[feature_nr]; } EXPORT_SYMBOL_GPL(get_xsave_addr); -- cgit v1.2.1 From 04cd027bcba1ead7bfe39e7f1c6f4d993c4c3323 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:00 -0700 Subject: x86/fpu/xstate: Wrap get_xsave_addr() to make it safer The MPX code appears is calling a low-level FPU function (copy_fpregs_to_fpstate()). This function is not able to be called in all contexts, although it is safe to call directly in some cases. Although probably correct, the current code is ugly and potentially error-prone. So, add a wrapper that calls the (slightly) higher-level fpu__save() (which is preempt- safe) and also ensures that we even *have* an FPU context (in the case that this was called when in lazy FPU mode). Ingo had this to say about the details about when we need preemption disabled: > it's indeed generally unsafe to access/copy FPU registers with preemption enabled, > for two reasons: > > - on older systems that use FSAVE the instruction destroys FPU register > contents, which has to be handled carefully > > - even on newer systems if we copy to FPU registers (which this code doesn't) > then we don't want a context switch to occur in the middle of it, because a > context switch will write to the fpstate, potentially overwriting our new data > with old FPU state. > > But it's safe to access FPU registers with preemption enabled in a couple of > special cases: > > - potentially destructively saving FPU registers: the signal handling code does > this in copy_fpstate_to_sigframe(), because it can rely on the signal restore > side to restore the original FPU state. > > - reading FPU registers on modern systems: we don't do this anywhere at the > moment, mostly to keep symmetry with older systems where FSAVE is > destructive. > > - initializing FPU registers on modern systems: fpu__clear() does this. Here > it's safe because we don't copy from the fpstate. > > - directly writing FPU registers from user-space memory (!). We do this in > fpu__restore_sig(), and it's safe because neither context switches nor > irq-handler FPU use can corrupt the source context of the copy (which is > user-space memory). > > Note that the MPX code's current use of copy_fpregs_to_fpstate() was safe I think, > because: > > - MPX is predicated on eagerfpu, so the destructive F[N]SAVE instruction won't be > used. > > - the code was only reading FPU registers, and was doing it only in places that > guaranteed that an FPU state was already active (i.e. didn't do it in > kthreads) Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Suresh Siddha Cc: bp@alien8.de Link: http://lkml.kernel.org/r/20150607183700.AA881696@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 1 + arch/x86/kernel/fpu/xstate.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 339894669117..4656b25bb9a7 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -41,5 +41,6 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); void *get_xsave_addr(struct xregs_state *xsave, int xstate); +const void *get_xsave_field_ptr(int xstate_field); #endif diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index af3700e0dbd2..49d0d9b2a60a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -427,3 +427,35 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) return (void *)xsave + xstate_comp_offsets[feature_nr]; } EXPORT_SYMBOL_GPL(get_xsave_addr); + +/* + * This wraps up the common operations that need to occur when retrieving + * data from xsave state. It first ensures that the current task was + * using the FPU and retrieves the data in to a buffer. It then calculates + * the offset of the requested field in the buffer. + * + * This function is safe to call whether the FPU is in use or not. + * + * Note that this only works on the current task. + * + * Inputs: + * @xsave_state: state which is defined in xsave.h (e.g. XSTATE_FP, + * XSTATE_SSE, etc...) + * Output: + * address of the state in the xsave area or NULL if the state + * is not present or is in its 'init state'. + */ +const void *get_xsave_field_ptr(int xsave_state) +{ + struct fpu *fpu = ¤t->thread.fpu; + + if (!fpu->fpstate_active) + return NULL; + /* + * fpu__save() takes the CPU's xstate registers + * and saves them off to the 'fpu memory buffer. + */ + fpu__save(fpu); + + return get_xsave_addr(&fpu->state.xsave, xsave_state); +} -- cgit v1.2.1 From a84eeaa96b36a03188e1423349669c108d3a4bd7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:01 -0700 Subject: x86/mpx: Use the new get_xsave_field_ptr()API The MPX registers (bndcsr/bndcfgu/bndstatus) are not directly accessible via normal instructions. They essentially act as if they were floating point registers and are saved/restored along with those registers. There are two main paths in the MPX code where we care about the contents of these registers: 1. #BR (bounds) faults 2. the prctl() code where we are setting MPX up Both of those paths _might_ be called without the FPU having been used. That means that 'tsk->thread.fpu.state' might never be allocated. Also, fpu_save_init() is not preempt-safe. It was a bug to call it without disabling preemption. The new get_xsave_addr() calls unlazy_fpu() instead and properly disables preemption. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Suresh Siddha Cc: bp@alien8.de Link: http://lkml.kernel.org/r/20150607183701.BC0D37CF@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpx.h | 8 ++++---- arch/x86/kernel/traps.c | 17 ++++++++--------- arch/x86/mm/mpx.c | 30 +++++++++++++++--------------- 3 files changed, 27 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index f3c1b71d4fae..39f2d0ffe1e2 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -60,8 +60,8 @@ #ifdef CONFIG_X86_INTEL_MPX siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xregs_state *xsave_buf); -int mpx_handle_bd_fault(struct xregs_state *xsave_buf); + struct task_struct *tsk); +int mpx_handle_bd_fault(struct task_struct *tsk); static inline int kernel_managing_mpx_tables(struct mm_struct *mm) { return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); @@ -78,11 +78,11 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); #else static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xregs_state *xsave_buf) + struct task_struct *tsk) { return NULL; } -static inline int mpx_handle_bd_fault(struct xregs_state *xsave_buf) +static inline int mpx_handle_bd_fault(struct task_struct *tsk) { return -EINVAL; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a2510f230195..42f15314b361 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_64 @@ -371,9 +372,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - struct xregs_state *xsave_buf; enum ctx_state prev_state; - struct bndcsr *bndcsr; + const struct bndcsr *bndcsr; siginfo_t *info; prev_state = exception_enter(); @@ -392,12 +392,11 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) /* * We need to look at BNDSTATUS to resolve this exception. - * It is not directly accessible, though, so we need to - * do an xsave and then pull it out of the xsave buffer. + * A NULL here might mean that it is in its 'init state', + * which is all zeros which indicates MPX was not + * responsible for the exception. */ - copy_fpregs_to_fpstate(&tsk->thread.fpu); - xsave_buf = &(tsk->thread.fpu.state.xsave); - bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR); if (!bndcsr) goto exit_trap; @@ -408,11 +407,11 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) */ switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) { case 2: /* Bound directory has invalid entry. */ - if (mpx_handle_bd_fault(xsave_buf)) + if (mpx_handle_bd_fault(tsk)) goto exit_trap; break; /* Success, it was handled */ case 1: /* Bound violation. */ - info = mpx_generate_siginfo(regs, xsave_buf); + info = mpx_generate_siginfo(regs, tsk); if (IS_ERR(info)) { /* * We failed to decode the MPX instruction. Act as if diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 2e0dfd39bd22..9d67e230b4fb 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -272,9 +272,9 @@ bad_opcode: * The caller is expected to kfree() the returned siginfo_t. */ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xregs_state *xsave_buf) + struct task_struct *tsk) { - struct bndreg *bndregs, *bndreg; + const struct bndreg *bndregs, *bndreg; siginfo_t *info = NULL; struct insn insn; uint8_t bndregno; @@ -294,8 +294,8 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, err = -EINVAL; goto err_out; } - /* get the bndregs _area_ of the xsave structure */ - bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS); + /* get bndregs field from current task's xsave area */ + bndregs = get_xsave_field_ptr(XSTATE_BNDREGS); if (!bndregs) { err = -EINVAL; goto err_out; @@ -342,7 +342,7 @@ err_out: static __user void *task_get_bounds_dir(struct task_struct *tsk) { - struct bndcsr *bndcsr; + const struct bndcsr *bndcsr; if (!cpu_feature_enabled(X86_FEATURE_MPX)) return MPX_INVALID_BOUNDS_DIR; @@ -357,8 +357,7 @@ static __user void *task_get_bounds_dir(struct task_struct *tsk) * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. */ - copy_fpregs_to_fpstate(&tsk->thread.fpu); - bndcsr = get_xsave_addr(&tsk->thread.fpu.state.xsave, XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR); if (!bndcsr) return MPX_INVALID_BOUNDS_DIR; @@ -389,9 +388,10 @@ int mpx_enable_management(struct task_struct *tsk) * directory into XSAVE/XRSTOR Save Area and enable MPX through * XRSTOR instruction. * - * copy_xregs_to_kernel() is expected to be very expensive. Storing the bounds - * directory here means that we do not have to do xsave in the unmap - * path; we can just use mm->bd_addr instead. + * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is + * expected to be relatively expensive. Storing the bounds + * directory here means that we do not have to do xsave in the + * unmap path; we can just use mm->bd_addr instead. */ bd_base = task_get_bounds_dir(tsk); down_write(&mm->mmap_sem); @@ -497,12 +497,12 @@ out_unmap: * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, * and the size of each bound table is 4MB. */ -static int do_mpx_bt_fault(struct xregs_state *xsave_buf) +static int do_mpx_bt_fault(struct task_struct *tsk) { unsigned long bd_entry, bd_base; - struct bndcsr *bndcsr; + const struct bndcsr *bndcsr; - bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR); if (!bndcsr) return -EINVAL; /* @@ -525,7 +525,7 @@ static int do_mpx_bt_fault(struct xregs_state *xsave_buf) return allocate_bt((long __user *)bd_entry); } -int mpx_handle_bd_fault(struct xregs_state *xsave_buf) +int mpx_handle_bd_fault(struct task_struct *tsk) { /* * Userspace never asked us to manage the bounds tables, @@ -534,7 +534,7 @@ int mpx_handle_bd_fault(struct xregs_state *xsave_buf) if (!kernel_managing_mpx_tables(current->mm)) return -EINVAL; - if (do_mpx_bt_fault(xsave_buf)) { + if (do_mpx_bt_fault(tsk)) { force_sig(SIGSEGV, current); /* * The force_sig() is essentially "handling" this -- cgit v1.2.1 From 46a6e0cf1c6665a8e867d8f7798d7a3538633f03 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:02 -0700 Subject: x86/mpx: Clean up the code by not passing a task pointer around when unnecessary The MPX code can only work on the current task. You can not, for instance, enable MPX management in another process or thread. You can also not handle a fault for another process or thread. Despite this, we pass a task_struct around prolifically. This patch removes all of the task struct passing for code paths where the code can not deal with another task (which turns out to be all of them). This has no functional changes. It's just a cleanup. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: bp@alien8.de Link: http://lkml.kernel.org/r/20150607183702.6A81DA2C@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpx.h | 10 ++++------ arch/x86/include/asm/processor.h | 12 ++++++------ arch/x86/kernel/traps.c | 5 ++--- arch/x86/mm/mpx.c | 19 +++++++++---------- 4 files changed, 21 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 39f2d0ffe1e2..0cdd16af71ad 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -59,9 +59,8 @@ MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT) #ifdef CONFIG_X86_INTEL_MPX -siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct task_struct *tsk); -int mpx_handle_bd_fault(struct task_struct *tsk); +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs); +int mpx_handle_bd_fault(void); static inline int kernel_managing_mpx_tables(struct mm_struct *mm) { return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); @@ -77,12 +76,11 @@ static inline void mpx_mm_init(struct mm_struct *mm) void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); #else -static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct task_struct *tsk) +static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { return NULL; } -static inline int mpx_handle_bd_fault(struct task_struct *tsk) +static inline int mpx_handle_bd_fault(void) { return -EINVAL; } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 8e04f51d6bea..53dbd2b4f1d8 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -802,18 +802,18 @@ extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); /* Register/unregister a process' MPX related resource */ -#define MPX_ENABLE_MANAGEMENT(tsk) mpx_enable_management((tsk)) -#define MPX_DISABLE_MANAGEMENT(tsk) mpx_disable_management((tsk)) +#define MPX_ENABLE_MANAGEMENT() mpx_enable_management() +#define MPX_DISABLE_MANAGEMENT() mpx_disable_management() #ifdef CONFIG_X86_INTEL_MPX -extern int mpx_enable_management(struct task_struct *tsk); -extern int mpx_disable_management(struct task_struct *tsk); +extern int mpx_enable_management(void); +extern int mpx_disable_management(void); #else -static inline int mpx_enable_management(struct task_struct *tsk) +static inline int mpx_enable_management(void) { return -EINVAL; } -static inline int mpx_disable_management(struct task_struct *tsk) +static inline int mpx_disable_management(void) { return -EINVAL; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 42f15314b361..cffff669be3f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -371,7 +371,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { - struct task_struct *tsk = current; enum ctx_state prev_state; const struct bndcsr *bndcsr; siginfo_t *info; @@ -407,11 +406,11 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) */ switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) { case 2: /* Bound directory has invalid entry. */ - if (mpx_handle_bd_fault(tsk)) + if (mpx_handle_bd_fault()) goto exit_trap; break; /* Success, it was handled */ case 1: /* Bound violation. */ - info = mpx_generate_siginfo(regs, tsk); + info = mpx_generate_siginfo(regs); if (IS_ERR(info)) { /* * We failed to decode the MPX instruction. Act as if diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 9d67e230b4fb..47e4a8564012 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -271,8 +271,7 @@ bad_opcode: * * The caller is expected to kfree() the returned siginfo_t. */ -siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct task_struct *tsk) +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { const struct bndreg *bndregs, *bndreg; siginfo_t *info = NULL; @@ -340,7 +339,7 @@ err_out: return ERR_PTR(err); } -static __user void *task_get_bounds_dir(struct task_struct *tsk) +static __user void *mpx_get_bounds_dir(void) { const struct bndcsr *bndcsr; @@ -376,10 +375,10 @@ static __user void *task_get_bounds_dir(struct task_struct *tsk) (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK); } -int mpx_enable_management(struct task_struct *tsk) +int mpx_enable_management(void) { void __user *bd_base = MPX_INVALID_BOUNDS_DIR; - struct mm_struct *mm = tsk->mm; + struct mm_struct *mm = current->mm; int ret = 0; /* @@ -393,7 +392,7 @@ int mpx_enable_management(struct task_struct *tsk) * directory here means that we do not have to do xsave in the * unmap path; we can just use mm->bd_addr instead. */ - bd_base = task_get_bounds_dir(tsk); + bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); mm->bd_addr = bd_base; if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) @@ -403,7 +402,7 @@ int mpx_enable_management(struct task_struct *tsk) return ret; } -int mpx_disable_management(struct task_struct *tsk) +int mpx_disable_management(void) { struct mm_struct *mm = current->mm; @@ -497,7 +496,7 @@ out_unmap: * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, * and the size of each bound table is 4MB. */ -static int do_mpx_bt_fault(struct task_struct *tsk) +static int do_mpx_bt_fault(void) { unsigned long bd_entry, bd_base; const struct bndcsr *bndcsr; @@ -525,7 +524,7 @@ static int do_mpx_bt_fault(struct task_struct *tsk) return allocate_bt((long __user *)bd_entry); } -int mpx_handle_bd_fault(struct task_struct *tsk) +int mpx_handle_bd_fault(void) { /* * Userspace never asked us to manage the bounds tables, @@ -534,7 +533,7 @@ int mpx_handle_bd_fault(struct task_struct *tsk) if (!kernel_managing_mpx_tables(current->mm)) return -EINVAL; - if (do_mpx_bt_fault(tsk)) { + if (do_mpx_bt_fault()) { force_sig(SIGSEGV, current); /* * The force_sig() is essentially "handling" this -- cgit v1.2.1 From 3c1d32300920a446c67d697cd6b80f012ad06028 Mon Sep 17 00:00:00 2001 From: Qiaowei Ren Date: Sun, 7 Jun 2015 11:37:02 -0700 Subject: x86/mpx: Remove redundant MPX_BNDCFG_ADDR_MASK MPX_BNDCFG_ADDR_MASK is defined two times, so this patch removes redundant one. Signed-off-by: Qiaowei Ren Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183702.5F129376@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpx.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 0cdd16af71ad..871e5e5b0499 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -45,7 +45,6 @@ #define MPX_BNDSTA_TAIL 2 #define MPX_BNDCFG_TAIL 12 #define MPX_BNDSTA_ADDR_MASK (~((1UL< Date: Sun, 7 Jun 2015 11:37:02 -0700 Subject: x86/mpx: Restrict the mmap() size check to bounds tables The comment and code here are confusing. We do not currently allocate the bounds directory in the kernel. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183702.222CEC2A@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mpx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 47e4a8564012..d6e02f3adee0 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -46,8 +46,8 @@ static unsigned long mpx_mmap(unsigned long len) vm_flags_t vm_flags; struct vm_area_struct *vma; - /* Only bounds table and bounds directory can be allocated here */ - if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES) + /* Only bounds table can be allocated here */ + if (len != MPX_BT_SIZE_BYTES) return -EINVAL; down_write(&mm->mmap_sem); -- cgit v1.2.1 From 8c3641e957a948f41f0174290096ed7a3b95e703 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:02 -0700 Subject: x86/mpx: Introduce a boot-time disable flag MPX has the _potential_ to cause some issues. Say part of your init system tried to protect one of its components from buffer overflows with MPX. If there were a false positive, it's possible that MPX could keep a system from booting. MPX could also potentially cause performance issues since it is present in hot paths like the unmap path. Allow it to be disabled at boot time. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150607183702.2E8B77AB@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 401ccb03054e..3956858e5312 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -144,6 +144,22 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +static int __init x86_mpx_setup(char *s) +{ + /* require an exact match without trailing characters */ + if (strlen(s)) + return 0; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_MPX)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_MPX); + pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n"); + return 1; +} +__setup("nompx", x86_mpx_setup); + #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; -- cgit v1.2.1 From e7126cf5f10aef1555cb99eddb7efff41bdf9566 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:03 -0700 Subject: x86/mpx: Trace #BR exceptions This is the first in a series of MPX tracing patches. I've found these extremely useful in the process of debugging applications and the kernel code itself. This exception hooks in to the bounds (#BR) exception very early and allows capturing the key registers which would influence how the exception is handled. Note that bndcfgu/bndstatus are technically still 64-bit registers even in 32-bit mode. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183703.5FE2619A@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trace/mpx.h | 50 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 2 ++ arch/x86/mm/mpx.c | 3 +++ 3 files changed, 55 insertions(+) create mode 100644 arch/x86/include/asm/trace/mpx.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h new file mode 100644 index 000000000000..5c03ec8a90d6 --- /dev/null +++ b/arch/x86/include/asm/trace/mpx.h @@ -0,0 +1,50 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mpx + +#if !defined(_TRACE_MPX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MPX_H + +#include + +#ifdef CONFIG_X86_INTEL_MPX + +TRACE_EVENT(bounds_exception_mpx, + + TP_PROTO(const struct bndcsr *bndcsr), + TP_ARGS(bndcsr), + + TP_STRUCT__entry( + __field(u64, bndcfgu) + __field(u64, bndstatus) + ), + + TP_fast_assign( + /* need to get rid of the 'const' on bndcsr */ + __entry->bndcfgu = (u64)bndcsr->bndcfgu; + __entry->bndstatus = (u64)bndcsr->bndstatus; + ), + + TP_printk("bndcfgu:0x%llx bndstatus:0x%llx", + __entry->bndcfgu, + __entry->bndstatus) +); + +#else + +/* + * This gets used outside of MPX-specific code, so we need a stub. + */ +static inline void trace_bounds_exception_mpx(const struct bndcsr *bndcsr) +{ +} + +#endif /* CONFIG_X86_INTEL_MPX */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/trace/ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE mpx +#endif /* _TRACE_MPX_H */ + +/* This part must be outside protection */ +#include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index cffff669be3f..36cb15b7b367 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_64 @@ -399,6 +400,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) if (!bndcsr) goto exit_trap; + trace_bounds_exception_mpx(bndcsr); /* * The error code field of the BNDSTATUS register communicates status * information of a bound range exception #BR or operation involving diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index d6e02f3adee0..1fef52c17fc8 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -17,6 +17,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + static const char *mpx_mapping_name(struct vm_area_struct *vma) { return "[mpx]"; -- cgit v1.2.1 From 97efebf1bc30a80122af3295ebdb726dbc040ca6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:03 -0700 Subject: x86/mpx: Trace entry to bounds exception paths There are two basic things that can happen as the result of a bounds exception (#BR): 1. We allocate a new bounds table 2. We pass up a bounds exception to userspace. This patch adds a trace point for the case where we are passing the exception up to userspace with a signal. We are also explicit that we're printing out the inverse of the 'upper' that we encounter. If you want to filter, for instance, you need to ~ the value first. The reason we do this is because of how 'upper' is stored in the bounds table. If a pointer's range is: 0x1000 -> 0x2000 it is stored in the bounds table as (32-bits here for brevity): lower: 0x00001000 upper: 0xffffdfff That is so that an all 0's entry: lower: 0x00000000 upper: 0x00000000 corresponds to the "init" bounds which store a *range* of: 0x00000000 -> 0xffffffff That is, by far, the common case, and that lets us use the zero page, or deduplicate the memory, etc... The 'upper' stored in the table is gibberish to print by itself, so we print ~upper to get the *actual*, logical, human-readable value printed out. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183703.027BB9B0@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trace/mpx.h | 34 ++++++++++++++++++++++++++++++++++ arch/x86/mm/mpx.c | 1 + 2 files changed, 35 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h index 5c03ec8a90d6..5c3af06a2ae1 100644 --- a/arch/x86/include/asm/trace/mpx.h +++ b/arch/x86/include/asm/trace/mpx.h @@ -8,6 +8,40 @@ #ifdef CONFIG_X86_INTEL_MPX +TRACE_EVENT(mpx_bounds_register_exception, + + TP_PROTO(void *addr_referenced, + const struct bndreg *bndreg), + TP_ARGS(addr_referenced, bndreg), + + TP_STRUCT__entry( + __field(void *, addr_referenced) + __field(u64, lower_bound) + __field(u64, upper_bound) + ), + + TP_fast_assign( + __entry->addr_referenced = addr_referenced; + __entry->lower_bound = bndreg->lower_bound; + __entry->upper_bound = bndreg->upper_bound; + ), + /* + * Note that we are printing out the '~' of the upper + * bounds register here. It is actually stored in its + * one's complement form so that its 'init' state + * corresponds to all 0's. But, that looks like + * gibberish when printed out, so print out the 1's + * complement instead of the actual value here. Note + * though that you still need to specify filters for the + * actual value, not the displayed one. + */ + TP_printk("address referenced: 0x%p bounds: lower: 0x%llx ~upper: 0x%llx", + __entry->addr_referenced, + __entry->lower_bound, + ~__entry->upper_bound + ) +); + TRACE_EVENT(bounds_exception_mpx, TP_PROTO(const struct bndcsr *bndcsr), diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 1fef52c17fc8..75e5d7043f65 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -335,6 +335,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) err = -EINVAL; goto err_out; } + trace_mpx_bounds_register_exception(info->si_addr, bndreg); return info; err_out: /* info might be NULL, but kfree() handles that */ -- cgit v1.2.1 From 2a1dcb1f796ad37028df37d96fc7c5b6b1705a43 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:03 -0700 Subject: x86/mpx: Trace the attempts to find bounds tables There are two different events being traced here. They are doing similar things so share a trace "EVENT_CLASS" and are presented together. 1. Trace when MPX is zapping pages "mpx_unmap_zap": When MPX can not free an entire bounds table, it will instead try to zap unused parts of a bounds table to free the backing memory. This decreases RSS (resident set size) without decreasing the virtual space allocated for bounds tables. 2. Trace attempts to find bounds tables "mpx_unmap_search": This event traces any time we go looking to unmap a bounds table for a given virtual address range. This is useful to ensure that the kernel actually "tried" to free a bounds table versus times it succeeded in finding one. It might try and fail if it realized that a table was shared with an adjacent VMA which is not being unmapped. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183703.B9D2468B@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trace/mpx.h | 32 ++++++++++++++++++++++++++++++++ arch/x86/mm/mpx.c | 2 ++ 2 files changed, 34 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h index 5c3af06a2ae1..c13c6fa3fa92 100644 --- a/arch/x86/include/asm/trace/mpx.h +++ b/arch/x86/include/asm/trace/mpx.h @@ -63,6 +63,38 @@ TRACE_EVENT(bounds_exception_mpx, __entry->bndstatus) ); +DECLARE_EVENT_CLASS(mpx_range_trace, + + TP_PROTO(unsigned long start, + unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("[0x%p:0x%p]", + (void *)__entry->start, + (void *)__entry->end + ) +); + +DEFINE_EVENT(mpx_range_trace, mpx_unmap_zap, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end) +); + +DEFINE_EVENT(mpx_range_trace, mpx_unmap_search, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end) +); + #else /* diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 75e5d7043f65..55729ee9263e 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -668,6 +668,7 @@ static int zap_bt_entries(struct mm_struct *mm, len = min(vma->vm_end, end) - addr; zap_page_range(vma, addr, len, NULL); + trace_mpx_unmap_zap(addr, addr+len); vma = vma->vm_next; addr = vma->vm_start; @@ -840,6 +841,7 @@ static int mpx_unmap_tables(struct mm_struct *mm, long __user *bd_entry, *bde_start, *bde_end; unsigned long bt_addr; + trace_mpx_unmap_search(start, end); /* * "Edge" bounds tables are those which are being used by the region * (start -> end), but that may be shared with adjacent areas. If they -- cgit v1.2.1 From cd4996dce18b619bd7b3acf75c91f49c77f05a97 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:04 -0700 Subject: x86/mpx: Trace allocation of new bounds tables Bounds tables are a significant consumer of memory. It is important to know when they are being allocated. Add a trace point to trace whenever an allocation occurs and also its virtual address. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183704.EC23A93E@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trace/mpx.h | 16 ++++++++++++++++ arch/x86/mm/mpx.c | 1 + 2 files changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h index c13c6fa3fa92..173dd3ba108c 100644 --- a/arch/x86/include/asm/trace/mpx.h +++ b/arch/x86/include/asm/trace/mpx.h @@ -95,6 +95,22 @@ DEFINE_EVENT(mpx_range_trace, mpx_unmap_search, TP_ARGS(start, end) ); +TRACE_EVENT(mpx_new_bounds_table, + + TP_PROTO(unsigned long table_vaddr), + TP_ARGS(table_vaddr), + + TP_STRUCT__entry( + __field(unsigned long, table_vaddr) + ), + + TP_fast_assign( + __entry->table_vaddr = table_vaddr; + ), + + TP_printk("table vaddr:%p", (void *)__entry->table_vaddr) +); + #else /* diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 55729ee9263e..c17fd27579af 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -483,6 +483,7 @@ static int allocate_bt(long __user *bd_entry) ret = -EINVAL; goto out_unmap; } + trace_mpx_new_bounds_table(bt_addr); return 0; out_unmap: vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES); -- cgit v1.2.1 From b0e9b09b3bd64e67bba862e238d3757b2482b6de Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:04 -0700 Subject: x86: Make is_64bit_mm() widely available The uprobes code has a nice helper, is_64bit_mm(), that consults both the runtime and compile-time flags for 32-bit support. Instead of reinventing the wheel, pull it in to an x86 header so we can use it for MPX. I prefer passing the 'mm' around to test_thread_flag(TIF_IA32) because it makes it explicit where the context is coming from. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183704.F0209999@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 13 +++++++++++++ arch/x86/kernel/uprobes.c | 10 +--------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 883f6b933fa4..5e8daee7c5c9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -142,6 +142,19 @@ static inline void arch_exit_mmap(struct mm_struct *mm) paravirt_arch_exit_mmap(mm); } +#ifdef CONFIG_X86_64 +static inline bool is_64bit_mm(struct mm_struct *mm) +{ + return !config_enabled(CONFIG_IA32_EMULATION) || + !(mm->context.ia32_compat == TIF_IA32); +} +#else +static inline bool is_64bit_mm(struct mm_struct *mm) +{ + return false; +} +#endif + static inline void arch_bprm_mm_init(struct mm_struct *mm, struct vm_area_struct *vma) { diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 0b81ad67da07..66476244731e 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -29,6 +29,7 @@ #include #include #include +#include /* Post-execution fixups. */ @@ -312,11 +313,6 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool } #ifdef CONFIG_X86_64 -static inline bool is_64bit_mm(struct mm_struct *mm) -{ - return !config_enabled(CONFIG_IA32_EMULATION) || - !(mm->context.ia32_compat == TIF_IA32); -} /* * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses @@ -497,10 +493,6 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) } } #else /* 32-bit: */ -static inline bool is_64bit_mm(struct mm_struct *mm) -{ - return false; -} /* * No RIP-relative addressing on 32-bit */ -- cgit v1.2.1 From a1149fc83a1f97612e72ec24a0bdbabff7b85e77 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:04 -0700 Subject: x86/mpx: Add temporary variable to reduce masking When we allocate a bounds table, we call mmap(), then add a "valid" bit to the value before storing it in to the bounds directory. If we fail along the way, we go and mask that valid bit _back_ out. That seems a little silly, and this makes it much more clear when we have a plain address versus an actual table _entry_. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183704.3D69D5F4@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mpx.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index c17fd27579af..4f7fb7c233cc 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -429,6 +429,7 @@ static int allocate_bt(long __user *bd_entry) unsigned long expected_old_val = 0; unsigned long actual_old_val = 0; unsigned long bt_addr; + unsigned long bd_new_entry; int ret = 0; /* @@ -441,7 +442,7 @@ static int allocate_bt(long __user *bd_entry) /* * Set the valid flag (kinda like _PAGE_PRESENT in a pte) */ - bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + bd_new_entry = bt_addr | MPX_BD_ENTRY_VALID_FLAG; /* * Go poke the address of the new bounds table in to the @@ -455,7 +456,7 @@ static int allocate_bt(long __user *bd_entry) * of the MPX code that have to pagefault_disable(). */ ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, - expected_old_val, bt_addr); + expected_old_val, bd_new_entry); if (ret) goto out_unmap; @@ -486,7 +487,7 @@ static int allocate_bt(long __user *bd_entry) trace_mpx_new_bounds_table(bt_addr); return 0; out_unmap: - vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES); + vm_munmap(bt_addr, MPX_BT_SIZE_BYTES); return ret; } -- cgit v1.2.1 From 54587653904c552c56b9dec153d7a89063394b09 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:04 -0700 Subject: x86/mpx: Introduce new 'directory entry' to 'addr' helper function Currently, to get from a bounds directory entry to the virtual address of a bounds table, we simply mask off a few low bits. However, the set of bits we mask off is different for 32-bit and 64-bit binaries. This breaks the operation out in to a helper function and also adds a temporary variable to store the result until we are sure we are returning one. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183704.007686CE@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpx.h | 1 - arch/x86/mm/mpx.c | 41 ++++++++++++++++++++++++++++++++++------- 2 files changed, 34 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 871e5e5b0499..99d374eb7b08 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -45,7 +45,6 @@ #define MPX_BNDSTA_TAIL 2 #define MPX_BNDCFG_TAIL 12 #define MPX_BNDSTA_ADDR_MASK (~((1UL< Date: Sun, 7 Jun 2015 11:37:05 -0700 Subject: x86/mpx: Use 32-bit-only cmpxchg() for 32-bit apps user_atomic_cmpxchg_inatomic() actually looks at sizeof(*ptr) to figure out how many bytes to copy. If we run it on a 64-bit kernel with a 64-bit pointer, it will copy a 64-bit bounds directory entry. That's fine, except when we have 32-bit programs with 32-bit bounds directory entries and we only *want* 32-bits. This patch breaks the cmpxchg() operation out in to its own function and performs the 32-bit type swizzling in there. Note, the "64-bit" version of this code _would_ work on a 32-bit-only kernel. The issue this patch addresses is only for when the kernel's 'long' is mismatched from the size of the bounds directory entry of the process we are working on. The new helper modifies 'actual_old_val' or returns an error. But gcc doesn't know this, so it warns about 'actual_old_val' being unused. Shut it up with an uninitialized_var(). Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183705.672B115E@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mpx.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 8cc793479039..294ea2092ef5 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -419,6 +419,35 @@ int mpx_disable_management(void) return 0; } +static int mpx_cmpxchg_bd_entry(struct mm_struct *mm, + unsigned long *curval, + unsigned long __user *addr, + unsigned long old_val, unsigned long new_val) +{ + int ret; + /* + * user_atomic_cmpxchg_inatomic() actually uses sizeof() + * the pointer that we pass to it to figure out how much + * data to cmpxchg. We have to be careful here not to + * pass a pointer to a 64-bit data type when we only want + * a 32-bit copy. + */ + if (is_64bit_mm(mm)) { + ret = user_atomic_cmpxchg_inatomic(curval, + addr, old_val, new_val); + } else { + u32 uninitialized_var(curval_32); + u32 old_val_32 = old_val; + u32 new_val_32 = new_val; + u32 __user *addr_32 = (u32 __user *)addr; + + ret = user_atomic_cmpxchg_inatomic(&curval_32, + addr_32, old_val_32, new_val_32); + *curval = curval_32; + } + return ret; +} + /* * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB, @@ -426,6 +455,7 @@ int mpx_disable_management(void) */ static int allocate_bt(long __user *bd_entry) { + struct mm_struct *mm = current->mm; unsigned long expected_old_val = 0; unsigned long actual_old_val = 0; unsigned long bt_addr; @@ -455,8 +485,8 @@ static int allocate_bt(long __user *bd_entry) * mmap_sem at this point, unlike some of the other part * of the MPX code that have to pagefault_disable(). */ - ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, - expected_old_val, bd_new_entry); + ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, bd_entry, + expected_old_val, bd_new_entry); if (ret) goto out_unmap; @@ -710,15 +740,16 @@ static int unmap_single_bt(struct mm_struct *mm, long __user *bd_entry, unsigned long bt_addr) { unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; - unsigned long actual_old_val = 0; + unsigned long uninitialized_var(actual_old_val); int ret; while (1) { int need_write = 1; + unsigned long cleared_bd_entry = 0; pagefault_disable(); - ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, - expected_old_val, 0); + ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, + bd_entry, expected_old_val, cleared_bd_entry); pagefault_enable(); if (!ret) break; -- cgit v1.2.1 From 613fcb7d3c79ec25b5913a6aa974c9047c31e68c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:05 -0700 Subject: x86/mpx: Support 32-bit binaries on 64-bit kernels Right now, the kernel can only switch between 64-bit and 32-bit binaries at compile time. This patch adds support for 32-bit binaries on 64-bit kernels when we support ia32 emulation. We essentially choose which set of table sizes to use when doing arithmetic for the bounds table calculations. This also uses a different approach for calculating the table indexes than before. I think the new one makes it much more clear what is going on, and allows us to share more code between the 32-bit and 64-bit cases. Based-on-patch-by: Qiaowei Ren Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183705.E01F21E2@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpx.h | 62 ++++++++--------- arch/x86/mm/mpx.c | 170 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 179 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 99d374eb7b08..7a35495275a9 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -13,49 +13,47 @@ #define MPX_BNDCFG_ENABLE_FLAG 0x1 #define MPX_BD_ENTRY_VALID_FLAG 0x1 -#ifdef CONFIG_X86_64 - -/* upper 28 bits [47:20] of the virtual address in 64-bit used to - * index into bounds directory (BD). - */ -#define MPX_BD_ENTRY_OFFSET 28 -#define MPX_BD_ENTRY_SHIFT 3 -/* bits [19:3] of the virtual address in 64-bit used to index into - * bounds table (BT). +/* + * The upper 28 bits [47:20] of the virtual address in 64-bit + * are used to index into bounds directory (BD). + * + * The directory is 2G (2^31) in size, and with 8-byte entries + * it has 2^28 entries. */ -#define MPX_BT_ENTRY_OFFSET 17 -#define MPX_BT_ENTRY_SHIFT 5 -#define MPX_IGN_BITS 3 -#define MPX_BD_ENTRY_TAIL 3 +#define MPX_BD_SIZE_BYTES_64 (1UL<<31) +#define MPX_BD_ENTRY_BYTES_64 8 +#define MPX_BD_NR_ENTRIES_64 (MPX_BD_SIZE_BYTES_64/MPX_BD_ENTRY_BYTES_64) -#else - -#define MPX_BD_ENTRY_OFFSET 20 -#define MPX_BD_ENTRY_SHIFT 2 -#define MPX_BT_ENTRY_OFFSET 10 -#define MPX_BT_ENTRY_SHIFT 4 -#define MPX_IGN_BITS 2 -#define MPX_BD_ENTRY_TAIL 2 +/* + * The 32-bit directory is 4MB (2^22) in size, and with 4-byte + * entries it has 2^20 entries. + */ +#define MPX_BD_SIZE_BYTES_32 (1UL<<22) +#define MPX_BD_ENTRY_BYTES_32 4 +#define MPX_BD_NR_ENTRIES_32 (MPX_BD_SIZE_BYTES_32/MPX_BD_ENTRY_BYTES_32) -#endif +/* + * A 64-bit table is 4MB total in size, and an entry is + * 4 64-bit pointers in size. + */ +#define MPX_BT_SIZE_BYTES_64 (1UL<<22) +#define MPX_BT_ENTRY_BYTES_64 32 +#define MPX_BT_NR_ENTRIES_64 (MPX_BT_SIZE_BYTES_64/MPX_BT_ENTRY_BYTES_64) -#define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT)) -#define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT)) +/* + * A 32-bit table is 16kB total in size, and an entry is + * 4 32-bit pointers in size. + */ +#define MPX_BT_SIZE_BYTES_32 (1UL<<14) +#define MPX_BT_ENTRY_BYTES_32 16 +#define MPX_BT_NR_ENTRIES_32 (MPX_BT_SIZE_BYTES_32/MPX_BT_ENTRY_BYTES_32) #define MPX_BNDSTA_TAIL 2 #define MPX_BNDCFG_TAIL 12 #define MPX_BNDSTA_ADDR_MASK (~((1UL<>(MPX_BT_ENTRY_OFFSET+ \ - MPX_IGN_BITS)) & MPX_BD_ENTRY_MASK) << MPX_BD_ENTRY_SHIFT) -#define MPX_GET_BT_ENTRY_OFFSET(addr) ((((addr)>>MPX_IGN_BITS) & \ - MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT) - #ifdef CONFIG_X86_INTEL_MPX siginfo_t *mpx_generate_siginfo(struct pt_regs *regs); int mpx_handle_bd_fault(void); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 294ea2092ef5..e323ef65c9ff 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -34,6 +34,22 @@ static int is_mpx_vma(struct vm_area_struct *vma) return (vma->vm_ops == &mpx_vma_ops); } +static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm) +{ + if (is_64bit_mm(mm)) + return MPX_BD_SIZE_BYTES_64; + else + return MPX_BD_SIZE_BYTES_32; +} + +static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm) +{ + if (is_64bit_mm(mm)) + return MPX_BT_SIZE_BYTES_64; + else + return MPX_BT_SIZE_BYTES_32; +} + /* * This is really a simplified "vm_mmap". it only handles MPX * bounds tables (the bounds directory is user-allocated). @@ -50,7 +66,7 @@ static unsigned long mpx_mmap(unsigned long len) struct vm_area_struct *vma; /* Only bounds table can be allocated here */ - if (len != MPX_BT_SIZE_BYTES) + if (len != mpx_bt_size_bytes(mm)) return -EINVAL; down_write(&mm->mmap_sem); @@ -449,13 +465,12 @@ static int mpx_cmpxchg_bd_entry(struct mm_struct *mm, } /* - * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each - * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB, + * With 32-bit mode, a bounds directory is 4MB, and the size of each + * bounds table is 16KB. With 64-bit mode, a bounds directory is 2GB, * and the size of each bounds table is 4MB. */ -static int allocate_bt(long __user *bd_entry) +static int allocate_bt(struct mm_struct *mm, long __user *bd_entry) { - struct mm_struct *mm = current->mm; unsigned long expected_old_val = 0; unsigned long actual_old_val = 0; unsigned long bt_addr; @@ -466,7 +481,7 @@ static int allocate_bt(long __user *bd_entry) * Carve the virtual space out of userspace for the new * bounds table: */ - bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES); + bt_addr = mpx_mmap(mpx_bt_size_bytes(mm)); if (IS_ERR((void *)bt_addr)) return PTR_ERR((void *)bt_addr); /* @@ -517,7 +532,7 @@ static int allocate_bt(long __user *bd_entry) trace_mpx_new_bounds_table(bt_addr); return 0; out_unmap: - vm_munmap(bt_addr, MPX_BT_SIZE_BYTES); + vm_munmap(bt_addr, mpx_bt_size_bytes(mm)); return ret; } @@ -536,6 +551,7 @@ static int do_mpx_bt_fault(void) { unsigned long bd_entry, bd_base; const struct bndcsr *bndcsr; + struct mm_struct *mm = current->mm; bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR); if (!bndcsr) @@ -554,10 +570,10 @@ static int do_mpx_bt_fault(void) * the directory is. */ if ((bd_entry < bd_base) || - (bd_entry >= bd_base + MPX_BD_SIZE_BYTES)) + (bd_entry >= bd_base + mpx_bd_size_bytes(mm))) return -EINVAL; - return allocate_bt((long __user *)bd_entry); + return allocate_bt(mm, (long __user *)bd_entry); } int mpx_handle_bd_fault(void) @@ -789,7 +805,115 @@ static int unmap_single_bt(struct mm_struct *mm, * avoid recursion, do_munmap() will check whether it comes * from one bounds table through VM_MPX flag. */ - return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES); + return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm)); +} + +static inline int bt_entry_size_bytes(struct mm_struct *mm) +{ + if (is_64bit_mm(mm)) + return MPX_BT_ENTRY_BYTES_64; + else + return MPX_BT_ENTRY_BYTES_32; +} + +/* + * Take a virtual address and turns it in to the offset in bytes + * inside of the bounds table where the bounds table entry + * controlling 'addr' can be found. + */ +static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm, + unsigned long addr) +{ + unsigned long bt_table_nr_entries; + unsigned long offset = addr; + + if (is_64bit_mm(mm)) { + /* Bottom 3 bits are ignored on 64-bit */ + offset >>= 3; + bt_table_nr_entries = MPX_BT_NR_ENTRIES_64; + } else { + /* Bottom 2 bits are ignored on 32-bit */ + offset >>= 2; + bt_table_nr_entries = MPX_BT_NR_ENTRIES_32; + } + /* + * We know the size of the table in to which we are + * indexing, and we have eliminated all the low bits + * which are ignored for indexing. + * + * Mask out all the high bits which we do not need + * to index in to the table. Note that the tables + * are always powers of two so this gives us a proper + * mask. + */ + offset &= (bt_table_nr_entries-1); + /* + * We now have an entry offset in terms of *entries* in + * the table. We need to scale it back up to bytes. + */ + offset *= bt_entry_size_bytes(mm); + return offset; +} + +/* + * How much virtual address space does a single bounds + * directory entry cover? + * + * Note, we need a long long because 4GB doesn't fit in + * to a long on 32-bit. + */ +static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) +{ + unsigned long long virt_space = (1ULL << boot_cpu_data.x86_virt_bits); + if (is_64bit_mm(mm)) + return virt_space / MPX_BD_NR_ENTRIES_64; + else + return virt_space / MPX_BD_NR_ENTRIES_32; +} + +/* + * Return an offset in terms of bytes in to the bounds + * directory where the bounds directory entry for a given + * virtual address resides. + * + * This has to be in bytes because the directory entries + * are different sizes on 64/32 bit. + */ +static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm, + unsigned long addr) +{ + /* + * There are several ways to derive the bd offsets. We + * use the following approach here: + * 1. We know the size of the virtual address space + * 2. We know the number of entries in a bounds table + * 3. We know that each entry covers a fixed amount of + * virtual address space. + * So, we can just divide the virtual address by the + * virtual space used by one entry to determine which + * entry "controls" the given virtual address. + */ + if (is_64bit_mm(mm)) { + int bd_entry_size = 8; /* 64-bit pointer */ + /* + * Take the 64-bit addressing hole in to account. + */ + addr &= ((1UL << boot_cpu_data.x86_virt_bits) - 1); + return (addr / bd_entry_virt_space(mm)) * bd_entry_size; + } else { + int bd_entry_size = 4; /* 32-bit pointer */ + /* + * 32-bit has no hole so this case needs no mask + */ + return (addr / bd_entry_virt_space(mm)) * bd_entry_size; + } + /* + * The two return calls above are exact copies. If we + * pull out a single copy and put it in here, gcc won't + * realize that we're doing a power-of-2 divide and use + * shifts. It uses a real divide. If we put them up + * there, it manages to figure it out (gcc 4.8.3). + */ } /* @@ -803,6 +927,7 @@ static int unmap_shared_bt(struct mm_struct *mm, unsigned long end, bool prev_shared, bool next_shared) { unsigned long bt_addr; + unsigned long start_off, end_off; int ret; ret = get_bt_addr(mm, bd_entry, &bt_addr); @@ -814,17 +939,20 @@ static int unmap_shared_bt(struct mm_struct *mm, if (ret) return ret; + start_off = mpx_get_bt_entry_offset_bytes(mm, start); + end_off = mpx_get_bt_entry_offset_bytes(mm, end); + if (prev_shared && next_shared) ret = zap_bt_entries(mm, bt_addr, - bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), - bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); + bt_addr + start_off, + bt_addr + end_off); else if (prev_shared) ret = zap_bt_entries(mm, bt_addr, - bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), - bt_addr+MPX_BT_SIZE_BYTES); + bt_addr + start_off, + bt_addr + mpx_bt_size_bytes(mm)); else if (next_shared) ret = zap_bt_entries(mm, bt_addr, bt_addr, - bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); + bt_addr + end_off); else ret = unmap_single_bt(mm, bd_entry, bt_addr); @@ -845,8 +973,8 @@ static int unmap_edge_bts(struct mm_struct *mm, struct vm_area_struct *prev, *next; bool prev_shared = false, next_shared = false; - bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); - bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); + bde_start = mm->bd_addr + mpx_get_bd_entry_offset(mm, start); + bde_end = mm->bd_addr + mpx_get_bd_entry_offset(mm, end-1); /* * Check whether bde_start and bde_end are shared with adjacent @@ -858,10 +986,10 @@ static int unmap_edge_bts(struct mm_struct *mm, * in to 'next'. */ next = find_vma_prev(mm, start, &prev); - if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1)) + if (prev && (mm->bd_addr + mpx_get_bd_entry_offset(mm, prev->vm_end-1)) == bde_start) prev_shared = true; - if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start)) + if (next && (mm->bd_addr + mpx_get_bd_entry_offset(mm, next->vm_start)) == bde_end) next_shared = true; @@ -927,8 +1055,8 @@ static int mpx_unmap_tables(struct mm_struct *mm, * 1. fully covered * 2. not at the edges of the mapping, even if full aligned */ - bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); - bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); + bde_start = mm->bd_addr + mpx_get_bd_entry_offset(mm, start); + bde_end = mm->bd_addr + mpx_get_bd_entry_offset(mm, end-1); for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) { ret = get_bt_addr(mm, bd_entry, &bt_addr); switch (ret) { -- cgit v1.2.1 From 3ceaccdf92073d193f0bfbe24280dd736e3fed86 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:06 -0700 Subject: x86/mpx: Rewrite the unmap code The MPX code needs to clear out bounds tables for memory which is no longer in use. We do this when a userspace mapping is torn down (unmapped). There are two modes: 1. An entire bounds table becomes unused, and can be freed and its pointer removed from the bounds directory. This happens either when a large mapping is torn down, or when a small mapping is torn down and it is the last mapping "covered" by a bounds table. 2. Only part of a bounds table becomes unused, in which case we free the backing memory as if MADV_DONTNEED was called. The old code was a spaghetti mess of "edge" bounds tables where the edges were handled specially, even if we were unmapping an entire one. Non-edge bounds tables are always fully unmapped, but share a different code path from the edge ones. The old code had a bug where it was unmapping too much memory. I worked on fixing it for two days and gave up. I didn't write the original code. I didn't particularly like it, but it worked, so I left it. After my debug session, I realized it was undebuggagle *and* buggy, so out it went. I also wrote a new unmapping test program which uncovers bugs pretty nicely. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183706.DCAEC67D@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mpx.c | 411 ++++++++++++++++++++++-------------------------------- 1 file changed, 168 insertions(+), 243 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index e323ef65c9ff..18fcf737bdbd 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -704,110 +704,6 @@ static int get_bt_addr(struct mm_struct *mm, return 0; } -/* - * Free the backing physical pages of bounds table 'bt_addr'. - * Assume start...end is within that bounds table. - */ -static int zap_bt_entries(struct mm_struct *mm, - unsigned long bt_addr, - unsigned long start, unsigned long end) -{ - struct vm_area_struct *vma; - unsigned long addr, len; - - /* - * Find the first overlapping vma. If vma->vm_start > start, there - * will be a hole in the bounds table. This -EINVAL return will - * cause a SIGSEGV. - */ - vma = find_vma(mm, start); - if (!vma || vma->vm_start > start) - return -EINVAL; - - /* - * A NUMA policy on a VM_MPX VMA could cause this bouds table to - * be split. So we need to look across the entire 'start -> end' - * range of this bounds table, find all of the VM_MPX VMAs, and - * zap only those. - */ - addr = start; - while (vma && vma->vm_start < end) { - /* - * We followed a bounds directory entry down - * here. If we find a non-MPX VMA, that's bad, - * so stop immediately and return an error. This - * probably results in a SIGSEGV. - */ - if (!is_mpx_vma(vma)) - return -EINVAL; - - len = min(vma->vm_end, end) - addr; - zap_page_range(vma, addr, len, NULL); - trace_mpx_unmap_zap(addr, addr+len); - - vma = vma->vm_next; - addr = vma->vm_start; - } - - return 0; -} - -static int unmap_single_bt(struct mm_struct *mm, - long __user *bd_entry, unsigned long bt_addr) -{ - unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; - unsigned long uninitialized_var(actual_old_val); - int ret; - - while (1) { - int need_write = 1; - unsigned long cleared_bd_entry = 0; - - pagefault_disable(); - ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, - bd_entry, expected_old_val, cleared_bd_entry); - pagefault_enable(); - if (!ret) - break; - if (ret == -EFAULT) - ret = mpx_resolve_fault(bd_entry, need_write); - /* - * If we could not resolve the fault, consider it - * userspace's fault and error out. - */ - if (ret) - return ret; - } - /* - * The cmpxchg was performed, check the results. - */ - if (actual_old_val != expected_old_val) { - /* - * Someone else raced with us to unmap the table. - * There was no bounds table pointed to by the - * directory, so declare success. Somebody freed - * it. - */ - if (!actual_old_val) - return 0; - /* - * Something messed with the bounds directory - * entry. We hold mmap_sem for read or write - * here, so it could not be a _new_ bounds table - * that someone just allocated. Something is - * wrong, so pass up the error and SIGSEGV. - */ - return -EINVAL; - } - - /* - * Note, we are likely being called under do_munmap() already. To - * avoid recursion, do_munmap() will check whether it comes - * from one bounds table through VM_MPX flag. - */ - return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm)); -} - static inline int bt_entry_size_bytes(struct mm_struct *mm) { if (is_64bit_mm(mm)) @@ -872,13 +768,69 @@ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) } /* - * Return an offset in terms of bytes in to the bounds - * directory where the bounds directory entry for a given - * virtual address resides. - * - * This has to be in bytes because the directory entries - * are different sizes on 64/32 bit. + * Free the backing physical pages of bounds table 'bt_addr'. + * Assume start...end is within that bounds table. */ +static noinline int zap_bt_entries_mapping(struct mm_struct *mm, + unsigned long bt_addr, + unsigned long start_mapping, unsigned long end_mapping) +{ + struct vm_area_struct *vma; + unsigned long addr, len; + unsigned long start; + unsigned long end; + + /* + * if we 'end' on a boundary, the offset will be 0 which + * is not what we want. Back it up a byte to get the + * last bt entry. Then once we have the entry itself, + * move 'end' back up by the table entry size. + */ + start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping); + end = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1); + /* + * Move end back up by one entry. Among other things + * this ensures that it remains page-aligned and does + * not screw up zap_page_range() + */ + end += bt_entry_size_bytes(mm); + + /* + * Find the first overlapping vma. If vma->vm_start > start, there + * will be a hole in the bounds table. This -EINVAL return will + * cause a SIGSEGV. + */ + vma = find_vma(mm, start); + if (!vma || vma->vm_start > start) + return -EINVAL; + + /* + * A NUMA policy on a VM_MPX VMA could cause this bounds table to + * be split. So we need to look across the entire 'start -> end' + * range of this bounds table, find all of the VM_MPX VMAs, and + * zap only those. + */ + addr = start; + while (vma && vma->vm_start < end) { + /* + * We followed a bounds directory entry down + * here. If we find a non-MPX VMA, that's bad, + * so stop immediately and return an error. This + * probably results in a SIGSEGV. + */ + if (!is_mpx_vma(vma)) + return -EINVAL; + + len = min(vma->vm_end, end) - addr; + zap_page_range(vma, addr, len, NULL); + trace_mpx_unmap_zap(addr, addr+len); + + vma = vma->vm_next; + addr = vma->vm_start; + } + return 0; +} + static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm, unsigned long addr) { @@ -916,69 +868,80 @@ static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm, */ } -/* - * If the bounds table pointed by bounds directory 'bd_entry' is - * not shared, unmap this whole bounds table. Otherwise, only free - * those backing physical pages of bounds table entries covered - * in this virtual address region start...end. - */ -static int unmap_shared_bt(struct mm_struct *mm, - long __user *bd_entry, unsigned long start, - unsigned long end, bool prev_shared, bool next_shared) +static int unmap_entire_bt(struct mm_struct *mm, + long __user *bd_entry, unsigned long bt_addr) { - unsigned long bt_addr; - unsigned long start_off, end_off; + unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + unsigned long uninitialized_var(actual_old_val); int ret; - ret = get_bt_addr(mm, bd_entry, &bt_addr); + while (1) { + int need_write = 1; + unsigned long cleared_bd_entry = 0; + + pagefault_disable(); + ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, + bd_entry, expected_old_val, cleared_bd_entry); + pagefault_enable(); + if (!ret) + break; + if (ret == -EFAULT) + ret = mpx_resolve_fault(bd_entry, need_write); + /* + * If we could not resolve the fault, consider it + * userspace's fault and error out. + */ + if (ret) + return ret; + } /* - * We could see an "error" ret for not-present bounds - * tables (not really an error), or actual errors, but - * stop unmapping either way. + * The cmpxchg was performed, check the results. */ - if (ret) - return ret; - - start_off = mpx_get_bt_entry_offset_bytes(mm, start); - end_off = mpx_get_bt_entry_offset_bytes(mm, end); - - if (prev_shared && next_shared) - ret = zap_bt_entries(mm, bt_addr, - bt_addr + start_off, - bt_addr + end_off); - else if (prev_shared) - ret = zap_bt_entries(mm, bt_addr, - bt_addr + start_off, - bt_addr + mpx_bt_size_bytes(mm)); - else if (next_shared) - ret = zap_bt_entries(mm, bt_addr, bt_addr, - bt_addr + end_off); - else - ret = unmap_single_bt(mm, bd_entry, bt_addr); - - return ret; + if (actual_old_val != expected_old_val) { + /* + * Someone else raced with us to unmap the table. + * That is OK, since we were both trying to do + * the same thing. Declare success. + */ + if (!actual_old_val) + return 0; + /* + * Something messed with the bounds directory + * entry. We hold mmap_sem for read or write + * here, so it could not be a _new_ bounds table + * that someone just allocated. Something is + * wrong, so pass up the error and SIGSEGV. + */ + return -EINVAL; + } + /* + * Note, we are likely being called under do_munmap() already. To + * avoid recursion, do_munmap() will check whether it comes + * from one bounds table through VM_MPX flag. + */ + return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm)); } -/* - * A virtual address region being munmap()ed might share bounds table - * with adjacent VMAs. We only need to free the backing physical - * memory of these shared bounds tables entries covered in this virtual - * address region. - */ -static int unmap_edge_bts(struct mm_struct *mm, - unsigned long start, unsigned long end) +static int try_unmap_single_bt(struct mm_struct *mm, + unsigned long start, unsigned long end) { + struct vm_area_struct *next; + struct vm_area_struct *prev; + /* + * "bta" == Bounds Table Area: the area controlled by the + * bounds table that we are unmapping. + */ + unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1); + unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm); + unsigned long uninitialized_var(bt_addr); + void __user *bde_vaddr; int ret; - long __user *bde_start, *bde_end; - struct vm_area_struct *prev, *next; - bool prev_shared = false, next_shared = false; - - bde_start = mm->bd_addr + mpx_get_bd_entry_offset(mm, start); - bde_end = mm->bd_addr + mpx_get_bd_entry_offset(mm, end-1); - /* - * Check whether bde_start and bde_end are shared with adjacent - * VMAs. + * We know 'start' and 'end' lie within an area controlled + * by a single bounds table. See if there are any other + * VMAs controlled by that bounds table. If there are not + * then we can "expand" the are we are unmapping to possibly + * cover the entire table. * * We already unliked the VMAs from the mm's rbtree so 'start' * is guaranteed to be in a hole. This gets us the first VMA @@ -986,102 +949,64 @@ static int unmap_edge_bts(struct mm_struct *mm, * in to 'next'. */ next = find_vma_prev(mm, start, &prev); - if (prev && (mm->bd_addr + mpx_get_bd_entry_offset(mm, prev->vm_end-1)) - == bde_start) - prev_shared = true; - if (next && (mm->bd_addr + mpx_get_bd_entry_offset(mm, next->vm_start)) - == bde_end) - next_shared = true; - - /* - * This virtual address region being munmap()ed is only - * covered by one bounds table. - * - * In this case, if this table is also shared with adjacent - * VMAs, only part of the backing physical memory of the bounds - * table need be freeed. Otherwise the whole bounds table need - * be unmapped. - */ - if (bde_start == bde_end) { - return unmap_shared_bt(mm, bde_start, start, end, - prev_shared, next_shared); + if ((!prev || prev->vm_end <= bta_start_vaddr) && + (!next || next->vm_start >= bta_end_vaddr)) { + /* + * No neighbor VMAs controlled by same bounds + * table. Try to unmap the whole thing + */ + start = bta_start_vaddr; + end = bta_end_vaddr; } + bde_vaddr = mm->bd_addr + mpx_get_bd_entry_offset(mm, start); + ret = get_bt_addr(mm, bde_vaddr, &bt_addr); /* - * If more than one bounds tables are covered in this virtual - * address region being munmap()ed, we need to separately check - * whether bde_start and bde_end are shared with adjacent VMAs. + * No bounds table there, so nothing to unmap. */ - ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false); - if (ret) - return ret; - ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared); + if (ret == -ENOENT) { + ret = 0; + return 0; + } if (ret) return ret; - - return 0; + /* + * We are unmapping an entire table. Either because the + * unmap that started this whole process was large enough + * to cover an entire table, or that the unmap was small + * but was the area covered by a bounds table. + */ + if ((start == bta_start_vaddr) && + (end == bta_end_vaddr)) + return unmap_entire_bt(mm, bde_vaddr, bt_addr); + return zap_bt_entries_mapping(mm, bt_addr, start, end); } static int mpx_unmap_tables(struct mm_struct *mm, unsigned long start, unsigned long end) { - int ret; - long __user *bd_entry, *bde_start, *bde_end; - unsigned long bt_addr; - + unsigned long one_unmap_start; trace_mpx_unmap_search(start, end); - /* - * "Edge" bounds tables are those which are being used by the region - * (start -> end), but that may be shared with adjacent areas. If they - * turn out to be completely unshared, they will be freed. If they are - * shared, we will free the backing store (like an MADV_DONTNEED) for - * areas used by this region. - */ - ret = unmap_edge_bts(mm, start, end); - switch (ret) { - /* non-present tables are OK */ - case 0: - case -ENOENT: - /* Success, or no tables to unmap */ - break; - case -EINVAL: - case -EFAULT: - default: - return ret; - } - - /* - * Only unmap the bounds table that are - * 1. fully covered - * 2. not at the edges of the mapping, even if full aligned - */ - bde_start = mm->bd_addr + mpx_get_bd_entry_offset(mm, start); - bde_end = mm->bd_addr + mpx_get_bd_entry_offset(mm, end-1); - for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) { - ret = get_bt_addr(mm, bd_entry, &bt_addr); - switch (ret) { - case 0: - break; - case -ENOENT: - /* No table here, try the next one */ - continue; - case -EINVAL: - case -EFAULT: - default: - /* - * Note: we are being strict here. - * Any time we run in to an issue - * unmapping tables, we stop and - * SIGSEGV. - */ - return ret; - } - ret = unmap_single_bt(mm, bd_entry, bt_addr); + one_unmap_start = start; + while (one_unmap_start < end) { + int ret; + unsigned long next_unmap_start = ALIGN(one_unmap_start+1, + bd_entry_virt_space(mm)); + unsigned long one_unmap_end = end; + /* + * if the end is beyond the current bounds table, + * move it back so we only deal with a single one + * at a time + */ + if (one_unmap_end > next_unmap_start) + one_unmap_end = next_unmap_start; + ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end); if (ret) return ret; - } + one_unmap_start = next_unmap_start; + } return 0; } -- cgit v1.2.1 From bea03c50b871a2fa922f31ad7c9993bb4fc7b192 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:06 -0700 Subject: x86/mpx: Do not count MPX VMAs as neighbors when unmapping The comment pretty much says it all. I wrote a test program that does lots of random allocations and forces bounds tables to be created. It came up with a layout like this: .... | BOUNDS DIRECTORY ENTRY COVERS | .... | BOUNDS TABLE COVERS | | BOUNDS TABLE | REAL ALLOC | BOUNDS TABLE | Unmapping "REAL ALLOC" should have been able to free the bounds table "covering" the "REAL ALLOC" because it was the last real user. But, the neighboring VMA bounds tables were found, considered as real neighbors, and we declined to free the bounds table covering the area. Doing this over and over left a small but significant number of these orphans. Handling them is fairly straighforward. All we have to do is walk the VMAs and skip all of the MPX ones when looking for neighbors. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183706.A6BD90BF@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mpx.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 18fcf737bdbd..6233d5195f01 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -936,17 +936,31 @@ static int try_unmap_single_bt(struct mm_struct *mm, unsigned long uninitialized_var(bt_addr); void __user *bde_vaddr; int ret; + /* + * We already unlinked the VMAs from the mm's rbtree so 'start' + * is guaranteed to be in a hole. This gets us the first VMA + * before the hole in to 'prev' and the next VMA after the hole + * in to 'next'. + */ + next = find_vma_prev(mm, start, &prev); + /* + * Do not count other MPX bounds table VMAs as neighbors. + * Although theoretically possible, we do not allow bounds + * tables for bounds tables so our heads do not explode. + * If we count them as neighbors here, we may end up with + * lots of tables even though we have no actual table + * entries in use. + */ + while (next && is_mpx_vma(next)) + next = next->vm_next; + while (prev && is_mpx_vma(prev)) + prev = prev->vm_prev; /* * We know 'start' and 'end' lie within an area controlled * by a single bounds table. See if there are any other * VMAs controlled by that bounds table. If there are not * then we can "expand" the are we are unmapping to possibly * cover the entire table. - * - * We already unliked the VMAs from the mm's rbtree so 'start' - * is guaranteed to be in a hole. This gets us the first VMA - * before the hole in to 'prev' and the next VMA after the hole - * in to 'next'. */ next = find_vma_prev(mm, start, &prev); if ((!prev || prev->vm_end <= bta_start_vaddr) && -- cgit v1.2.1 From 97ac46a5087eaf87fd76ff7bb31ec9c896010442 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 7 Jun 2015 11:37:06 -0700 Subject: x86/mpx: Allow 32-bit binaries on 64-bit kernels again Now that the bugs in mixed mode MPX handling are fixed, re-allow 32-bit binaries on 64-bit kernels again. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150607183706.70277DAD@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mpx.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 6233d5195f01..7a657f58bbea 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -366,12 +366,6 @@ static __user void *mpx_get_bounds_dir(void) if (!cpu_feature_enabled(X86_FEATURE_MPX)) return MPX_INVALID_BOUNDS_DIR; - /* - * 32-bit binaries on 64-bit kernels are currently - * unsupported. - */ - if (IS_ENABLED(CONFIG_X86_64) && test_thread_flag(TIF_IA32)) - return MPX_INVALID_BOUNDS_DIR; /* * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. -- cgit v1.2.1 From 1dace0116d0b05c967d94644fc4dfe96be2ecd3d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 9 Jun 2015 18:54:07 -0500 Subject: x86/PCI: Use host bridge _CRS info on Foxconn K8M890-8237A The Foxconn K8M890-8237A has two PCI host bridges, and we can't assign resources correctly without the information from _CRS that tells us which address ranges are claimed by which bridge. In the bugs mentioned below, we incorrectly assign a sound card address (this example is from 1033299): bus: 00 index 2 [mem 0x80000000-0xfcffffffff] ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-7f]) pci_root PNP0A08:00: host bridge window [mem 0x80000000-0xbfefffff] (ignored) pci_root PNP0A08:00: host bridge window [mem 0xc0000000-0xdfffffff] (ignored) pci_root PNP0A08:00: host bridge window [mem 0xf0000000-0xfebfffff] (ignored) ACPI: PCI Root Bridge [PCI1] (domain 0000 [bus 80-ff]) pci_root PNP0A08:01: host bridge window [mem 0xbff00000-0xbfffffff] (ignored) pci 0000:80:01.0: [1106:3288] type 0 class 0x000403 pci 0000:80:01.0: reg 10: [mem 0xbfffc000-0xbfffffff 64bit] pci 0000:80:01.0: address space collision: [mem 0xbfffc000-0xbfffffff 64bit] conflicts with PCI Bus #00 [mem 0x80000000-0xfcffffffff] pci 0000:80:01.0: BAR 0: assigned [mem 0xfd00000000-0xfd00003fff 64bit] BUG: unable to handle kernel paging request at ffffc90000378000 IP: [] azx_create+0x37c/0x822 [snd_hda_intel] We assigned 0xfd_0000_0000, but that is not in any of the host bridge windows, and the sound card doesn't work. Turn on pci=use_crs automatically for this system. Link: https://bugs.launchpad.net/ubuntu/+source/alsa-driver/+bug/931368 Link: https://bugs.launchpad.net/ubuntu/+source/alsa-driver/+bug/1033299 Signed-off-by: Bjorn Helgaas CC: stable@vger.kernel.org --- arch/x86/pci/acpi.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index e4695985f9de..d8e225826489 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -81,6 +81,17 @@ static const struct dmi_system_id pci_crs_quirks[] __initconst = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), }, }, + /* https://bugs.launchpad.net/ubuntu/+source/alsa-driver/+bug/931368 */ + /* https://bugs.launchpad.net/ubuntu/+source/alsa-driver/+bug/1033299 */ + { + .callback = set_use_crs, + .ident = "Foxconn K8M890-8237A", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Foxconn"), + DMI_MATCH(DMI_BOARD_NAME, "K8M890-8237A"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), + }, + }, /* Now for the blacklist.. */ -- cgit v1.2.1 From aee4b013a71666f11ffeac11ab45bb7c6e0e394d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 9 Jun 2015 20:54:07 +0200 Subject: x86/asm/entry/32: Fix fallout from the R9 trick removal in the SYSCALL code I put %ebp restoration code too late. Under strace, it is not reached and %ebp is not restored upon return to userspace. This is the fix. Run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433876051-26604-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 2093ce6be203..2c44180d2fac 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -344,6 +344,7 @@ cstar_dispatch: call *ia32_sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) 1: + movl RCX(%rsp), %ebp DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -351,7 +352,6 @@ cstar_dispatch: sysretl_from_sys_call: andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RCX(%rsp), %ebp RESTORE_RSI_RDI_RDX movl RIP(%rsp), %ecx movl EFLAGS(%rsp), %r11d -- cgit v1.2.1 From 1536bb46fac7672ef04aaaa6a3b07848314263bc Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 9 Jun 2015 20:54:08 +0200 Subject: x86/asm/entry/32: Explain reloading of registers after __audit_syscall_entry() Here it is not obvious why we load pt_regs->cx to %esi etc. Lets improve comments. Explain that here we combine two things: first, we reload registers since some of them are clobbered by the C function we just called; and we also convert 32-bit syscall params to 64-bit C ABI, because we are going to jump back to syscall dispatch code. Move reloading of 6th argument into the macro instead of having it after each of two macro invocations. No actual code changes here. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433876051-26604-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 2c44180d2fac..0fa108cdf182 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -185,12 +185,18 @@ sysexit_from_sys_call: movl %ebx, %esi /* 2nd arg: 1st syscall arg */ movl %eax, %edi /* 1st arg: syscall number */ call __audit_syscall_entry - movl ORIG_RAX(%rsp), %eax /* reload syscall number */ - movl %ebx, %edi /* reload 1st syscall arg */ - movl RCX(%rsp), %esi /* reload 2nd syscall arg */ - movl RDX(%rsp), %edx /* reload 3rd syscall arg */ - movl RSI(%rsp), %ecx /* reload 4th syscall arg */ - movl RDI(%rsp), %r8d /* reload 5th syscall arg */ + /* + * We are going to jump back to syscall dispatch. + * Prepare syscall args as required by 64-bit C ABI. + * Clobbered registers are loaded from pt_regs on stack. + */ + movl ORIG_RAX(%rsp), %eax /* syscall number */ + movl %ebx, %edi /* arg1 */ + movl RCX(%rsp), %esi /* arg2 */ + movl RDX(%rsp), %edx /* arg3 */ + movl RSI(%rsp), %ecx /* arg4 */ + movl RDI(%rsp), %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ .endm .macro auditsys_exit exit @@ -221,7 +227,6 @@ sysexit_from_sys_call: sysenter_auditsys: auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ jmp sysenter_dispatch sysexit_audit: @@ -379,7 +384,6 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ jmp cstar_dispatch sysretl_audit: -- cgit v1.2.1 From a92fde25231a89d7d10895482556260c1b63767d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 9 Jun 2015 20:54:09 +0200 Subject: x86/asm/entry/32: Shorten __audit_syscall_entry() args preparation We use three MOVs to swap edx and ecx. We can use one XCHG instead. Expand the comments. It's difficult to keep track which arg# every register corresponds to, so spell it out. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433876051-26604-3-git-send-email-dvlasenk@redhat.com [ Expanded the comments some more. ] Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 0fa108cdf182..bb187a6a877c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -178,17 +178,26 @@ sysexit_from_sys_call: #ifdef CONFIG_AUDITSYSCALL .macro auditsys_entry_common - movl %esi, %r8d /* 5th arg: 4th syscall arg */ - movl %ecx, %r9d /* swap with edx */ - movl %edx, %ecx /* 4th arg: 3rd syscall arg */ - movl %r9d, %edx /* 3rd arg: 2nd syscall arg */ - movl %ebx, %esi /* 2nd arg: 1st syscall arg */ - movl %eax, %edi /* 1st arg: syscall number */ + /* + * At this point, registers hold syscall args in the 32-bit syscall ABI: + * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP. + * + * We want to pass them to __audit_syscall_entry(), which is a 64-bit + * C function with 5 parameters, so shuffle them to match what + * the function expects: RDI,RSI,RDX,RCX,R8. + */ + movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */ + xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */ + /* arg3 (RDX) <= 2nd syscall arg (ECX) */ + movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */ + movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */ call __audit_syscall_entry + /* - * We are going to jump back to syscall dispatch. - * Prepare syscall args as required by 64-bit C ABI. - * Clobbered registers are loaded from pt_regs on stack. + * We are going to jump back to the syscall dispatch code. + * Prepare syscall args as required by the 64-bit C ABI. + * Registers clobbered by __audit_syscall_entry() are + * loaded from pt_regs on stack: */ movl ORIG_RAX(%rsp), %eax /* syscall number */ movl %ebx, %edi /* arg1 */ -- cgit v1.2.1 From 539f5113650068ba221197f190267ab727296ef5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Jun 2015 12:36:01 -0700 Subject: x86/asm/entry/64: Disentangle error_entry/exit gsbase/ebx/usermode code The error_entry/error_exit code to handle gsbase and whether we return to user mdoe was a mess: - error_sti was misnamed. In particular, it did not enable interrupts. - Error handling for gs_change was hopelessly tangled the normal usermode path. Separate it out. This saves a branch in normal entries from kernel mode. - The comments were bad. Fix it up. As a nice side effect, there's now a code path that happens on error entries from user mode. We'll use it soon. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f1be898ab93360169fb845ab85185948832209ee.1433878454.git.luto@kernel.org [ Prettified it, clarified comments some more. ] Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index bd97161f90cb..3bb2c4302df1 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1135,7 +1135,7 @@ END(paranoid_exit) /* * Save all registers in pt_regs, and switch gs if needed. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) cld @@ -1144,9 +1144,11 @@ ENTRY(error_entry) xorl %ebx, %ebx testb $3, CS+8(%rsp) jz error_kernelspace -error_swapgs: + + /* We entered from user mode */ SWAPGS -error_sti: + +error_entry_done: TRACE_IRQS_OFF ret @@ -1165,8 +1167,15 @@ error_kernelspace: cmpq %rax, RIP+8(%rsp) je bstep_iret cmpq $gs_change, RIP+8(%rsp) - je error_swapgs - jmp error_sti + jne error_entry_done + + /* + * hack: gs_change can fail with user gsbase. If this happens, fix up + * gsbase and proceed. We'll fix up the exception and land in + * gs_change's error handler with kernel gsbase. + */ + SWAPGS + jmp error_entry_done bstep_iret: /* Fix truncated RIP */ @@ -1174,16 +1183,30 @@ bstep_iret: /* fall through */ error_bad_iret: + /* + * We came from an IRET to user mode, so we have user gsbase. + * Switch to kernel gsbase: + */ SWAPGS + + /* + * Pretend that the exception came from user mode: set up pt_regs + * as if we faulted immediately after IRET and clear EBX so that + * error_exit knows that we will be returning to user mode. + */ mov %rsp, %rdi call fixup_bad_iret mov %rax, %rsp - decl %ebx /* Return to usergs */ - jmp error_sti + decl %ebx + jmp error_entry_done END(error_entry) -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ +/* + * On entry, EBS is a "return to kernel mode" flag: + * 1: already in kernel mode, don't need SWAPGS + * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode + */ ENTRY(error_exit) movl %ebx, %eax RESTORE_EXTRA_REGS -- cgit v1.2.1 From 5ec45a192fe6e287f0fc06d5ca4f3bd446d94803 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 10 Jun 2015 11:15:02 -0700 Subject: arch/x86/kvm/mmu.c: work around gcc-4.4.4 bug Fix this compile issue with gcc-4.4.4: arch/x86/kvm/mmu.c: In function 'kvm_mmu_pte_write': arch/x86/kvm/mmu.c:4256: error: unknown field 'cr0_wp' specified in initializer arch/x86/kvm/mmu.c:4257: error: unknown field 'cr4_pae' specified in initializer arch/x86/kvm/mmu.c:4257: warning: excess elements in union initializer ... gcc-4.4.4 (at least) has issues when using anonymous unions in initializers. Fixes: edc90b7dc4ceef6 ("KVM: MMU: fix SMAP virtualization") Cc: Xiao Guangrong Cc: Paolo Bonzini Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kvm/mmu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 44a7d2515497..b73337634214 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4215,13 +4215,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush, zap_page; - union kvm_mmu_page_role mask = (union kvm_mmu_page_role) { - .cr0_wp = 1, - .cr4_pae = 1, - .nxe = 1, - .smep_andnot_wp = 1, - .smap_andnot_wp = 1, - }; + union kvm_mmu_page_role mask = { }; + + mask.cr0_wp = 1; + mask.cr4_pae = 1; + mask.nxe = 1; + mask.smep_andnot_wp = 1; + mask.smap_andnot_wp = 1; /* * If we don't have indirect shadow pages, it means no page is -- cgit v1.2.1 From 186dfc9d69b96a38ec6ec654127dba4432184494 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 10 Jun 2015 17:49:41 +0200 Subject: x86/swiotlb: Try coherent allocations with __GFP_NOWARN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we boot a kdump kernel in high memory, there is by default only 72MB of low memory available. The swiotlb code takes 64MB of it (by default) so that there are only 8MB left to allocate from. On systems with many devices this causes page allocator warnings from dma_generic_alloc_coherent(): systemd-udevd: page allocation failure: order:0, mode:0x280d4 CPU: 0 PID: 197 Comm: systemd-udevd Tainted: G W 3.12.28-4-default #1 Hardware name: HP ProLiant DL980 G7, BIOS P66 07/30/2012 ffff8800781335e0 ffffffff8150b1db 00000000000280d4 ffffffff8113af90 0000000000000000 0000000000000000 ffff88007efdbb00 0000000100000000 0000000000000000 0000000000000000 0000000000000000 0000000000000001 Call Trace: dump_trace+0x7d/0x2d0 show_stack_log_lvl+0x94/0x170 show_stack+0x21/0x50 dump_stack+0x41/0x51 warn_alloc_failed+0xf0/0x160 __alloc_pages_slowpath+0x72f/0x796 __alloc_pages_nodemask+0x1ea/0x210 dma_generic_alloc_coherent+0x96/0x140 x86_swiotlb_alloc_coherent+0x1c/0x50 ttm_dma_pool_alloc_new_pages+0xab/0x320 [ttm] ttm_dma_populate+0x3ce/0x640 [ttm] ttm_tt_bind+0x36/0x60 [ttm] ttm_bo_handle_move_mem+0x55f/0x5c0 [ttm] ttm_bo_move_buffer+0x105/0x130 [ttm] ttm_bo_validate+0xc1/0x130 [ttm] ttm_bo_init+0x24b/0x400 [ttm] radeon_bo_create+0x16c/0x200 [radeon] radeon_ring_init+0x11e/0x2b0 [radeon] r100_cp_init+0x123/0x5b0 [radeon] r100_startup+0x194/0x230 [radeon] r100_init+0x223/0x410 [radeon] radeon_device_init+0x6af/0x830 [radeon] radeon_driver_load_kms+0x89/0x180 [radeon] drm_get_pci_dev+0x121/0x2f0 [drm] local_pci_probe+0x39/0x60 pci_device_probe+0xa9/0x120 driver_probe_device+0x9d/0x3d0 __driver_attach+0x8b/0x90 bus_for_each_dev+0x5b/0x90 bus_add_driver+0x1f8/0x2c0 driver_register+0x5b/0xe0 do_one_initcall+0xf2/0x1a0 load_module+0x1207/0x1c70 SYSC_finit_module+0x75/0xa0 system_call_fastpath+0x16/0x1b 0x7fac533d2788 After these warnings the code enters a fall-back path and allocated directly from the swiotlb aperture in the end. So remove these warnings as this is not a fatal error. Signed-off-by: Joerg Roedel [ Simplify, reflow comment. ] Signed-off-by: Borislav Petkov Acked-by: Baoquan He Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jörg Rödel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: kexec@lists.infradead.org Link: http://lkml.kernel.org/r/1433500202-25531-3-git-send-email-joro@8bytes.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 77dd0ad58be4..adf0392d549a 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -20,6 +20,13 @@ void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, { void *vaddr; + /* + * Don't print a warning when the first allocation attempt fails. + * swiotlb_alloc_coherent() will print a warning when the DMA + * memory allocation ultimately failed. + */ + flags |= __GFP_NOWARN; + vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags, attrs); if (vaddr) -- cgit v1.2.1 From 94fb9334182284e8e7e4bcb9125c25dc33af19d4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 10 Jun 2015 17:49:42 +0200 Subject: x86/crash: Allocate enough low memory when crashkernel=high MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the crash kernel is loaded above 4GiB in memory, the first kernel allocates only 72MiB of low-memory for the DMA requirements of the second kernel. On systems with many devices this is not enough and causes device driver initialization errors and failed crash dumps. Testing by SUSE and Redhat has shown that 256MiB is a good default value for now and the discussion has lead to this value as well. So set this default value to 256MiB to make sure there is enough memory available for DMA. Signed-off-by: Joerg Roedel [ Reflow comment. ] Signed-off-by: Borislav Petkov Acked-by: Baoquan He Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jörg Rödel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: kexec@lists.infradead.org Link: http://lkml.kernel.org/r/1433500202-25531-4-git-send-email-joro@8bytes.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d74ac33290ae..cba828892790 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -531,12 +531,14 @@ static void __init reserve_crashkernel_low(void) if (ret != 0) { /* * two parts from lib/swiotlb.c: - * swiotlb size: user specified with swiotlb= or default. - * swiotlb overflow buffer: now is hardcoded to 32k. - * We round it to 8M for other buffers that - * may need to stay low too. + * -swiotlb size: user-specified with swiotlb= or default. + * + * -swiotlb overflow buffer: now hardcoded to 32k. We round it + * to 8M for other buffers that may need to stay low too. Also + * make sure we allocate enough extra low memory so that we + * don't run out of DMA buffers for 32-bit devices. */ - low_size = swiotlb_size_or_default() + (8UL<<20); + low_size = max(swiotlb_size_or_default() + (8UL<<20), 256UL<<20); auto_set = true; } else { /* passed with crashkernel=0,low ? */ -- cgit v1.2.1 From a8424003679e90b9952e20adcd1ff1560d9dd3e9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 11 Jun 2015 12:34:00 -0700 Subject: x86/fpu: Fix double-increment in setup_xstate_features() I noticed that my MPX tracepoints were producing garbage for the lower and upper bounds: mpx_bounds_register_exception: address referenced: 0x00007fffffffccb7 bounds: lower: 0x0 ~upper: 0xffffffffffffffff mpx_bounds_register_exception: address referenced: 0x00007fffffffccbf bounds: lower: 0x0 ~upper: 0xffffffffffffffff This is, of course, bogus because 0x00007fffffffccbf is *within* the bounds. I assumed that my instruction decoder was bad and went looking at it. But I eventually realized that I was getting a '0' offset back from xstate_offsets[BNDREGS]. It was being skipped in the initialization, which is obviously bogus, so remove the extra leaf++. This also goes an initializes xstate_offsets/sizes[] to -1 so so that bugs like this will oops instead of silently failing in interesting ways. This was introduced by: 39f1acd ("x86/fpu/xstate: Don't assume the first zero xfeatures zero bit means the end") Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@sr71.net Link: http://lkml.kernel.org/r/20150611193400.2E0B00DB@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 49d0d9b2a60a..62fc001c7846 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -31,7 +31,8 @@ static const char *xfeature_names[] = */ u64 xfeatures_mask __read_mostly; -static unsigned int xstate_offsets[XFEATURES_NR_MAX], xstate_sizes[XFEATURES_NR_MAX]; +static unsigned int xstate_offsets[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; /* The number of supported xfeatures in xfeatures_mask: */ @@ -187,7 +188,6 @@ static void __init setup_xstate_features(void) xstate_sizes[leaf] = eax; printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %04x, xstate_sizes[%d]: %04x\n", leaf, ebx, leaf, eax); - leaf++; } } -- cgit v1.2.1 From 6f281923949a4944a7d5625e4f900ec02fefee59 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 9 Jun 2015 13:20:28 +0800 Subject: iommu: Add new member capability to struct irq_remap_ops Add a new member 'capability' to struct irq_remap_ops for storing information about available capabilities such as VT-d Posted-Interrupts. Signed-off-by: Feng Wu Reviewed-by: Jiang Liu Reviewed-by: Thomas Gleixner Acked-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/1433827237-3382-2-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 78974fbc33b4..8b432dd8f170 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -29,6 +29,10 @@ struct msi_msg; struct irq_alloc_info; +enum irq_remap_cap { + IRQ_POSTING_CAP = 0, +}; + #ifdef CONFIG_IRQ_REMAP extern void set_irq_remapping_broken(void); -- cgit v1.2.1 From 8541186faf3b59623c86022cc5f21ce9bd1332c5 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 9 Jun 2015 13:20:31 +0800 Subject: iommu, x86: Implement irq_set_vcpu_affinity for intel_ir_chip Interrupt chip callback to set the VCPU affinity for posted interrupts. [ tglx: Use the helper function to copy from the remap irte instead of open coding it. Massage the comment as well ] Signed-off-by: Feng Wu Reviewed-by: Jiang Liu Reviewed-by: Thomas Gleixner Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: joro@8bytes.org Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/1433827237-3382-5-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 8b432dd8f170..e479fbda55cf 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -57,6 +57,11 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void) return x86_vector_domain; } +struct vcpu_data { + u64 pi_desc_addr; /* Physical address of PI Descriptor */ + u32 vector; /* Guest vector of the interrupt */ +}; + #else /* CONFIG_IRQ_REMAP */ static inline void set_irq_remapping_broken(void) { } -- cgit v1.2.1 From 959c870f7305be019d9316bc4e038dc6119d51ad Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 9 Jun 2015 13:20:36 +0800 Subject: iommu, x86: Provide irq_remapping_cap() interface Add a new interface irq_remapping_cap() to detect whether irq remapping supports new features, such as VT-d Posted-Interrupts. Export the function, so that KVM code can check this and use this mechanism properly. Signed-off-by: Feng Wu Reviewed-by: Jiang Liu Reviewed-by: Thomas Gleixner Acked-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/1433827237-3382-10-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index e479fbda55cf..046c7fb1ca43 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -35,6 +35,7 @@ enum irq_remap_cap { #ifdef CONFIG_IRQ_REMAP +extern bool irq_remapping_cap(enum irq_remap_cap cap); extern void set_irq_remapping_broken(void); extern int irq_remapping_prepare(void); extern int irq_remapping_enable(void); @@ -64,6 +65,7 @@ struct vcpu_data { #else /* CONFIG_IRQ_REMAP */ +static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } static inline void set_irq_remapping_broken(void) { } static inline int irq_remapping_prepare(void) { return -ENODEV; } static inline int irq_remapping_enable(void) { return -ENODEV; } -- cgit v1.2.1 From de1e00871d1e6ad803850b9eb1d4d32305187c26 Mon Sep 17 00:00:00 2001 From: Jeremiah Mahler Date: Fri, 12 Jun 2015 21:56:40 -0700 Subject: crypto: aesni - fix crypto_fpu_exit() section mismatch The '__init aesni_init()' function calls the '__exit crypto_fpu_exit()' function directly. Since they are in different sections, this generates a warning. make CONFIG_DEBUG_SECTION_MISMATCH=y ... WARNING: arch/x86/crypto/aesni-intel.o(.init.text+0x12b): Section mismatch in reference from the function init_module() to the function .exit.text:crypto_fpu_exit() The function __init init_module() references a function __exit crypto_fpu_exit(). This is often seen when error handling in the init function uses functionality in the exit path. The fix is often to remove the __exit annotation of crypto_fpu_exit() so it may be used outside an exit section. Fix the warning by removing the __exit annotation. Signed-off-by: Jeremiah Mahler Signed-off-by: Herbert Xu --- arch/x86/crypto/fpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index f368ba261739..321e97979662 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -156,7 +156,7 @@ int __init crypto_fpu_init(void) return crypto_register_template(&crypto_fpu_tmpl); } -void __exit crypto_fpu_exit(void) +void crypto_fpu_exit(void) { crypto_unregister_template(&crypto_fpu_tmpl); } -- cgit v1.2.1 From b6ac069532218027f2991cba01d7a72a200688b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 5 Jun 2015 20:57:41 +0200 Subject: KVM: x86: fix lapic.timer_mode on restore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lapic.timer_mode was not properly initialized after migration, which broke few useful things, like login, by making every sleep eternal. Fix this by calling apic_update_lvtt in kvm_apic_post_state_restore. There are other slowpaths that update lvtt, so this patch makes sure something similar doesn't happen again by calling apic_update_lvtt after every modification. Cc: stable@vger.kernel.org Fixes: f30ebc312ca9 ("KVM: x86: optimize some accesses to LVTT and SPIV") Signed-off-by: Radim Krčmář Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 629af0f1c5c4..4c7deb4f78a1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1090,6 +1090,17 @@ static void update_divide_count(struct kvm_lapic *apic) apic->divide_count); } +static void apic_update_lvtt(struct kvm_lapic *apic) +{ + u32 timer_mode = kvm_apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask; + + if (apic->lapic_timer.timer_mode != timer_mode) { + apic->lapic_timer.timer_mode = timer_mode; + hrtimer_cancel(&apic->lapic_timer.timer); + } +} + static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; @@ -1298,6 +1309,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) apic_set_reg(apic, APIC_LVTT + 0x10 * i, lvt_val | APIC_LVT_MASKED); } + apic_update_lvtt(apic); atomic_set(&apic->lapic_timer.pending, 0); } @@ -1330,20 +1342,13 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; - case APIC_LVTT: { - u32 timer_mode = val & apic->lapic_timer.timer_mode_mask; - - if (apic->lapic_timer.timer_mode != timer_mode) { - apic->lapic_timer.timer_mode = timer_mode; - hrtimer_cancel(&apic->lapic_timer.timer); - } - + case APIC_LVTT: if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); apic_set_reg(apic, APIC_LVTT, val); + apic_update_lvtt(apic); break; - } case APIC_TMICT: if (apic_lvtt_tscdeadline(apic)) @@ -1576,7 +1581,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); - apic->lapic_timer.timer_mode = 0; + apic_update_lvtt(apic); apic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); @@ -1802,6 +1807,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); + apic_update_lvtt(apic); update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; -- cgit v1.2.1 From 6c6685055a285de53f18fbf6611687291b57ccd6 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 20 May 2015 11:53:39 -0700 Subject: kconfig: add xenconfig defconfig helper This lets you build a kernel which can support xen dom0 or xen guests on i386, x86-64 and arm64 by just using: make xenconfig You can start from an allnoconfig and then switch to xenconfig. This also splits out the options which are available currently to be built with x86 and 'make ARCH=arm64' under a shared config. Technically xen supports a dom0 kernel and also a guest kernel configuration but upon review with the xen team since we don't have many dom0 options its best to just combine these two into one. A few generic notes: we enable both of these: CONFIG_INET=y CONFIG_BINFMT_ELF=y although technically not required given you likely will end up with a pretty useless system otherwise. A few architectural differences worth noting: $ make allnoconfig; make xenconfig > /dev/null ; \ grep XEN .config > 64-bit-config $ make ARCH=i386 allnoconfig; make ARCH=i386 xenconfig > /dev/null; \ grep XEN .config > 32-bit-config $ make ARCH=arm64 allnoconfig; make ARCH=arm64 xenconfig > /dev/null; \ grep XEN .config > arm64-config Since the options are already split up with a generic config and architecture specific configs you anything on the x86 configs are known to only work right now on x86. For instance arm64 doesn't support MEMORY_HOTPLUG yet as such although we try to enabe it generically arm64 doesn't have it yet, so we leave the xen specific kconfig option XEN_BALLOON_MEMORY_HOTPLUG on x86's config file to set expecations correctly. Then on x86 we have differences between i386 and x86-64. The difference between 64-bit-config and 32-bit-config is you don't get XEN_MCE_LOG as this is only supported on 64-bit. You also do not get on i386 XEN_BALLOON_MEMORY_HOTPLUG, there does not seem to be any technical reasons to not allow this but I gave up after a few attempts. Cc: Josh Triplett Cc: Borislav Petkov Cc: Pekka Enberg Cc: David Rientjes Cc: Michal Marek Cc: Randy Dunlap Cc: penberg@kernel.org Cc: levinsasha928@gmail.com Cc: mtosatti@redhat.com Cc: fengguang.wu@intel.com Cc: David Vrabel Cc: Ian Campbell Cc: Konrad Rzeszutek Wilk Cc: xen-devel@lists.xenproject.org Acked-by: Stefano Stabellini Acked-by: Julien Grall Acked-by: Michal Marek Acked-by: David Rientjes Reviewed-by: Josh Triplett Signed-off-by: Luis R. Rodriguez Signed-off-by: David Vrabel --- arch/x86/configs/xen.config | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 arch/x86/configs/xen.config (limited to 'arch/x86') diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config new file mode 100644 index 000000000000..d9fc7139fd46 --- /dev/null +++ b/arch/x86/configs/xen.config @@ -0,0 +1,28 @@ +# global x86 required specific stuff +# On 32-bit HIGHMEM4G is not allowed +CONFIG_HIGHMEM64G=y +CONFIG_64BIT=y + +# These enable us to allow some of the +# not so generic stuff below +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_X86_MCE=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_CPU_FREQ=y + +# x86 xen specific config options +CONFIG_XEN_PVH=y +CONFIG_XEN_MAX_DOMAIN_MEMORY=500 +CONFIG_XEN_SAVE_RESTORE=y +# CONFIG_XEN_DEBUG_FS is not set +CONFIG_XEN_MCE_LOG=y +CONFIG_XEN_ACPI_PROCESSOR=m +# x86 specific backend drivers +CONFIG_XEN_PCIDEV_BACKEND=m +# x86 specific frontend drivers +CONFIG_XEN_PCIDEV_FRONTEND=m +# depends on MEMORY_HOTPLUG, arm64 doesn't enable this yet, +# move to generic config if it ever does. +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y -- cgit v1.2.1 From 4711e2f9caedaa07e7cdcb5e058a18762d6be9b1 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 1 May 2015 20:05:49 -0400 Subject: x86: don't use module_init in non-modular intel_mid_vrtc.c The X86_INTEL_MID option is bool, and hence this code is either present or absent. It will never be modular, so using module_init as an alias for __initcall is rather misleading. Fix this up now, so that we can relocate module_init from init.h into module.h in the future. If we don't do this, we'd have to add module.h to obviously non-modular code, and that would be a worse thing. Note that direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of device_initcall directly in this change means that the runtime impact is zero -- it will remain at level 6 in initcall ordering. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Paul Gortmaker --- arch/x86/platform/intel-mid/intel_mid_vrtc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c index 32947ba0f62d..ee40fcb6e54d 100644 --- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c +++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c @@ -173,5 +173,4 @@ static int __init intel_mid_device_create(void) return platform_device_register(&vrtc_device); } - -module_init(intel_mid_device_create); +device_initcall(intel_mid_device_create); -- cgit v1.2.1 From d54b675a6b0007422dc13acbecdb1ca2b1a53aeb Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 1 May 2015 20:05:49 -0400 Subject: x86: don't use module_init in non-modular devicetree.c code The devicetree.o is built for "OF" -- which is bool, and hence this code is either present or absent. It will never be modular, so using module_init as an alias for __initcall can be somewhat misleading. Fix this up now, so that we can relocate module_init from init.h into module.h in the future. If we don't do this, we'd have to add module.h to obviously non-modular code, and that would be a worse thing. Note that direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of device_initcall directly in this change means that the runtime impact is zero -- it will remain at level 6 in initcall ordering. Reported-by: kbuild test robot Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Paul Gortmaker --- arch/x86/kernel/devicetree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 6367a780cc8c..3743b92089de 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -65,7 +65,7 @@ static int __init add_bus_probe(void) return of_platform_bus_probe(NULL, ce4100_ids, NULL); } -module_init(add_bus_probe); +device_initcall(add_bus_probe); #ifdef CONFIG_PCI struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) -- cgit v1.2.1 From 1206f53589237b7e00b9b0a4e42815f14aedad2d Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 1 May 2015 20:08:21 -0400 Subject: x86: don't use module_init for non-modular core bootflag code The bootflag.o is obj-y (always built in). It will never be modular, so using module_init as an alias for __initcall is somewhat misleading. Fix this up now, so that we can relocate module_init from init.h into module.h in the future. If we don't do this, we'd have to add module.h to obviously non-modular code, and that would be a worse thing. Note that direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of arch_initcall (which makes sense for arch code) will thus change this registration from level 6-device to level 3-arch (i.e. slightly earlier). However no observable impact of that small difference has been observed during testing, or is expected. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Paul Gortmaker --- arch/x86/kernel/bootflag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 5de7f4c56971..52c8e3c7789d 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -98,4 +98,4 @@ static int __init sbf_init(void) return 0; } -module_init(sbf_init); +arch_initcall(sbf_init); -- cgit v1.2.1 From ca41d24cf56458a699b44e918c5a19b7077df422 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 1 May 2015 21:57:34 -0400 Subject: x86: perf_event_intel_bts.c: use arch_initcall to hook in enabling This was using module_init, but the current Kconfig situation is as follows: In arch/x86/kernel/cpu/Makefile: obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o and in arch/x86/Kconfig.cpu: config CPU_SUP_INTEL default y bool "Support Intel processors" if PROCESSOR_SELECT So currently, the end user can not build this code into a module. If in the future, there is desire for this to be modular, then it can be changed to include and use module_init. But currently, in the non-modular case, a module_init becomes a device_initcall. But this really isn't a device, so we should choose a more appropriate initcall bucket to put it in. The obvious choice here seems to be arch_initcall, but that does make it earlier than it was currently through device_initcall. As long as perf_pmu_register() is functional, we should be OK. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Paul Gortmaker --- arch/x86/kernel/cpu/perf_event_intel_bts.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index ac1f0c55f379..7b2fec86dead 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -521,5 +521,4 @@ static __init int bts_init(void) return perf_pmu_register(&bts_pmu, "intel_bts", -1); } - -module_init(bts_init); +arch_initcall(bts_init); -- cgit v1.2.1 From 5b00c1eb94e5936e5bf5cdd9ad1ddfbed0c39159 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 1 May 2015 21:57:34 -0400 Subject: x86: perf_event_intel_pt.c: use arch_initcall to hook in enabling This was using module_init, but the current Kconfig situation is as follows: In arch/x86/kernel/cpu/Makefile: obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o and in arch/x86/Kconfig.cpu: config CPU_SUP_INTEL default y bool "Support Intel processors" if PROCESSOR_SELECT So currently, the end user can not build this code into a module. If in the future, there is desire for this to be modular, then it can be changed to include and use module_init. But currently, in the non-modular case, a module_init becomes a device_initcall. But this really isn't a device, so we should choose a more appropriate initcall bucket to put it in. The obvious choice here seems to be arch_initcall, but that does make it earlier than it was currently through device_initcall. As long as perf_pmu_register() is functional, we should be OK. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Paul Gortmaker --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 123ff1bb2f60..0d52d76f1397 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -1106,5 +1106,4 @@ static __init int pt_init(void) return ret; } - -module_init(pt_init); +arch_initcall(pt_init); -- cgit v1.2.1 From 70c4f78b23c69013c908222d55a07c96fea4bba1 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 1 May 2015 20:13:42 -0400 Subject: x86: replace __init_or_module with __init in non-modular vsmp_64.c The __init_or_module is from commit 05e12e1c4c09cd35ac9f4e6af1e ("x86: fix 27-rc crash on vsmp due to paravirt during module load"). But as of commit 70511134f61bd6e5eed19f767381f9fb3e762d49 ("Revert "x86: don't compile vsmp_64 for 32bit") this file became obj-y and hence is now only for built-in. That makes any "_or_module" support no longer necessary. We need to distinguish between the two in order to do some header reorganization between init.h and module.h and we don't want to be including module.h in non-modular code. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Paul Gortmaker --- arch/x86/kernel/vsmp_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index ee22c1d93ae5..b034b1b14b9c 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -72,7 +72,7 @@ asmlinkage __visible void vsmp_irq_enable(void) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable); -static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, +static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { switch (type) { -- cgit v1.2.1 From 3d9fecf6bfb8b12bc2f9a4c7109895a2a2bb9436 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 9 Jun 2015 17:31:38 -0500 Subject: x86/PCI: Use host bridge _CRS info on systems with >32 bit addressing We enable _CRS on all systems from 2008 and later. On older systems, we ignore _CRS and assume the whole physical address space (excluding RAM and other devices) is available for PCI devices, but on systems that support physical address spaces larger than 4GB, it's doubtful that the area above 4GB is really available for PCI. After d56dbf5bab8c ("PCI: Allocate 64-bit BARs above 4G when possible"), we try to use that space above 4GB *first*, so we're more likely to put a device there. On Juan's Toshiba Satellite Pro U200, BIOS left the graphics, sound, 1394, and card reader devices unassigned (but only after Windows had been booted). Only the sound device had a 64-bit BAR, so it was the only device placed above 4GB, and hence the only device that didn't work. Keep _CRS enabled even on pre-2008 systems if they support physical address space larger than 4GB. Fixes: d56dbf5bab8c ("PCI: Allocate 64-bit BARs above 4G when possible") Reported-and-tested-by: Juan Dayer Reported-and-tested-by: Alan Horsfield Link: https://bugzilla.kernel.org/show_bug.cgi?id=99221 Link: https://bugzilla.opensuse.org/show_bug.cgi?id=907092 Signed-off-by: Bjorn Helgaas CC: stable@vger.kernel.org # v3.14+ --- arch/x86/pci/acpi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d8e225826489..2ae7ce240e02 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -132,8 +132,10 @@ void __init pci_acpi_crs_quirks(void) { int year; - if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008) - pci_use_crs = false; + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008) { + if (iomem_resource.end <= 0xffffffff) + pci_use_crs = false; + } dmi_check_system(pci_crs_quirks); -- cgit v1.2.1 From b58d930750135d6c5b8e5aa084c0e9303c78c286 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 15 Jun 2015 17:40:01 +0800 Subject: x86/platform/intel/baytrail: Add comments about why we disabled HPET on Baytrail This question has been asked many times, and finally I found the official document which explains the problem of HPET on Baytrail, that it will halt in deep idle states. Signed-off-by: Feng Tang Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: john.stultz@linaro.org Cc: len.brown@intel.com Cc: matthew.lee@intel.com Link: http://lkml.kernel.org/r/1434361201-31743-1-git-send-email-feng.tang@intel.com [ Prettified things a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index fe9f0b79a18b..5cb9a4d6f623 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -627,8 +627,12 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, QFLAG_APPLY_ONCE, intel_graphics_stolen }, /* - * HPET on current version of Baytrail platform has accuracy - * problems, disable it for now: + * HPET on the current version of the Baytrail platform has accuracy + * problems: it will halt in deep idle state - so we disable it. + * + * More details can be found in section 18.10.1.3 of the datasheet: + * + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, -- cgit v1.2.1 From cc2749e4095cbbcb35518fb2db5e926b85c3f25f Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 15 Jun 2015 10:28:15 +0200 Subject: x86/cpu/amd: Give access to the number of nodes in a physical package Stash the number of nodes in a physical processor package locally and add an accessor to be called by interested parties. The first user is the MCE injection module which uses it to find the node base core in a package for injecting a certain type of errors. Signed-off-by: Aravind Gopalakrishnan [ Rewrote the commit message, merged it with the accessor patch and unified naming. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jacob Shin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac Cc: mchehab@osg.samsung.com Link: http://lkml.kernel.org/r/1433868317-18417-2-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/amd.c | 23 ++++++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 23ba6765b718..9aa52fd13a78 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -946,6 +946,7 @@ static inline int mpx_disable_management(struct task_struct *tsk) #endif /* CONFIG_X86_INTEL_MPX */ extern u16 amd_get_nb_id(int cpu); +extern u32 amd_get_nodes_per_socket(void); static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 94e7051fba1a..56cae1964a81 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -19,6 +19,13 @@ #include "cpu.h" +/* + * nodes_per_socket: Stores the number of nodes per socket. + * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX + * Node Identifiers[10:8] + */ +static u32 nodes_per_socket = 1; + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -291,7 +298,7 @@ static int nearby_node(int apicid) #ifdef CONFIG_X86_HT static void amd_get_topology(struct cpuinfo_x86 *c) { - u32 nodes, cores_per_cu = 1; + u32 cores_per_cu = 1; u8 node_id; int cpu = smp_processor_id(); @@ -300,7 +307,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - nodes = ((ecx >> 8) & 7) + 1; + nodes_per_socket = ((ecx >> 8) & 7) + 1; node_id = ecx & 7; /* get compute unit information */ @@ -311,18 +318,18 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes = ((value >> 3) & 7) + 1; + nodes_per_socket = ((value >> 3) & 7) + 1; node_id = value & 7; } else return; /* fixup multi-node processor information */ - if (nodes > 1) { + if (nodes_per_socket > 1) { u32 cores_per_node; u32 cus_per_node; set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cores_per_node = c->x86_max_cores / nodes; + cores_per_node = c->x86_max_cores / nodes_per_socket; cus_per_node = cores_per_node / cores_per_cu; /* store NodeID, use llc_shared_map to store sibling info */ @@ -366,6 +373,12 @@ u16 amd_get_nb_id(int cpu) } EXPORT_SYMBOL_GPL(amd_get_nb_id); +u32 amd_get_nodes_per_socket(void) +{ + return nodes_per_socket; +} +EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket); + static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA -- cgit v1.2.1 From 4b36f1a4139c9284df74c0f5d7655603d67807df Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 11 Jun 2015 13:52:22 -0700 Subject: perf/x86: Add more Broadwell model numbers This patch adds additional model numbers for Broadwell to perf. Support for Broadwell with Iris Pro (Intel Core i7-57xxC) and support for Broadwell Server Xeon. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434055942-28253-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a1e35c9f06b9..afdd7c08d0fc 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -3253,6 +3253,8 @@ __init int intel_pmu_init(void) case 61: /* 14nm Broadwell Core-M */ case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); -- cgit v1.2.1 From 6b099d9b040b0f3d0aec05b560d7caf879af5077 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 11 Jun 2015 15:13:56 +0300 Subject: perf/x86/intel/bts: Fix DS area sharing with x86_pmu events Currently, the intel_bts driver relies on the DS area allocated by the x86_pmu code in its event_init() path, which is a bug: creating a BTS event while no x86_pmu events are present results in a NULL pointer dereference. The same DS area is also used by PEBS sampling, which makes it quite a bit trickier to have a separate one for intel_bts' purposes. This patch makes intel_bts driver use the same DS allocation and reference counting code as x86_pmu to make sure it is always present when either intel_bts or x86_pmu need it. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Link: http://lkml.kernel.org/r/1434024837-9916-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 52 +++++++++++++++++++----------- arch/x86/kernel/cpu/perf_event.h | 4 +++ arch/x86/kernel/cpu/perf_event_intel_bts.c | 9 ++++++ 3 files changed, 46 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4f7001f28936..aa4e3a74e541 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -270,11 +270,7 @@ msr_fail: static void hw_perf_event_destroy(struct perf_event *event) { - if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { - release_pmc_hardware(); - release_ds_buffers(); - mutex_unlock(&pmc_reserve_mutex); - } + x86_release_hardware(); } void hw_perf_lbr_event_destroy(struct perf_event *event) @@ -324,6 +320,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return x86_pmu_extra_regs(val, event); } +int x86_reserve_hardware(void) +{ + int err = 0; + + if (!atomic_inc_not_zero(&active_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&active_events) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + reserve_ds_buffers(); + } + if (!err) + atomic_inc(&active_events); + mutex_unlock(&pmc_reserve_mutex); + } + + return err; +} + +void x86_release_hardware(void) +{ + if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { + release_pmc_hardware(); + release_ds_buffers(); + mutex_unlock(&pmc_reserve_mutex); + } +} + /* * Check if we can create event of a certain type (that no conflicting events * are present). @@ -336,9 +361,10 @@ int x86_add_exclusive(unsigned int what) return 0; mutex_lock(&pmc_reserve_mutex); - for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) + for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) goto out; + } atomic_inc(&x86_pmu.lbr_exclusive[what]); ret = 0; @@ -527,19 +553,7 @@ static int __x86_pmu_event_init(struct perf_event *event) if (!x86_pmu_initialized()) return -ENODEV; - err = 0; - if (!atomic_inc_not_zero(&active_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_events) == 0) { - if (!reserve_pmc_hardware()) - err = -EBUSY; - else - reserve_ds_buffers(); - } - if (!err) - atomic_inc(&active_events); - mutex_unlock(&pmc_reserve_mutex); - } + err = x86_reserve_hardware(); if (err) return err; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ef78516850fb..f068695eaca0 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -703,6 +703,10 @@ int x86_add_exclusive(unsigned int what); void x86_del_exclusive(unsigned int what); +int x86_reserve_hardware(void); + +void x86_release_hardware(void); + void hw_perf_lbr_event_destroy(struct perf_event *event); int x86_setup_perfctr(struct perf_event *event); diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index ac1f0c55f379..7795f3f8b1d5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -483,17 +483,26 @@ static int bts_event_add(struct perf_event *event, int mode) static void bts_event_destroy(struct perf_event *event) { + x86_release_hardware(); x86_del_exclusive(x86_lbr_exclusive_bts); } static int bts_event_init(struct perf_event *event) { + int ret; + if (event->attr.type != bts_pmu.type) return -ENOENT; if (x86_add_exclusive(x86_lbr_exclusive_bts)) return -EBUSY; + ret = x86_reserve_hardware(); + if (ret) { + x86_del_exclusive(x86_lbr_exclusive_bts); + return ret; + } + event->destroy = bts_event_destroy; return 0; -- cgit v1.2.1 From 1b7b938f181742f899a2f31c280f79dabef6ddb6 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 9 Jun 2015 13:03:26 +0300 Subject: perf/x86/intel: Fix PMI handling for Intel PT Intel PT is a separate PMU and it is not using any of the x86_pmu code paths, which means in particular that the active_events counter remains intact when new PT events are created. However, PT uses the generic x86_pmu PMI handler for its PMI handling needs. The problem here is that the latter checks active_events and in case of it being zero, exits without calling the actual x86_pmu.handle_nmi(), which results in unknown NMI errors and massive data loss for PT. The effect is not visible if there are other perf events in the system at the same time that keep active_events counter non-zero, for instance if the NMI watchdog is running, so one needs to disable it to reproduce the problem. At the same time, the active_events counter besides doing what the name suggests also implicitly serves as a PMC hardware and DS area reference counter. This patch adds a separate reference counter for the PMC hardware, leaving active_events for actually counting the events and makes sure it also counts PT and BTS events. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Link: http://lkml.kernel.org/r/87k2v92t0s.fsf@ashishki-desk.ger.corp.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index aa4e3a74e541..6be186ccef46 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -135,6 +135,7 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) } static atomic_t active_events; +static atomic_t pmc_refcount; static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC @@ -271,6 +272,7 @@ msr_fail: static void hw_perf_event_destroy(struct perf_event *event) { x86_release_hardware(); + atomic_dec(&active_events); } void hw_perf_lbr_event_destroy(struct perf_event *event) @@ -324,16 +326,16 @@ int x86_reserve_hardware(void) { int err = 0; - if (!atomic_inc_not_zero(&active_events)) { + if (!atomic_inc_not_zero(&pmc_refcount)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_events) == 0) { + if (atomic_read(&pmc_refcount) == 0) { if (!reserve_pmc_hardware()) err = -EBUSY; else reserve_ds_buffers(); } if (!err) - atomic_inc(&active_events); + atomic_inc(&pmc_refcount); mutex_unlock(&pmc_reserve_mutex); } @@ -342,7 +344,7 @@ int x86_reserve_hardware(void) void x86_release_hardware(void) { - if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { + if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); mutex_unlock(&pmc_reserve_mutex); @@ -371,12 +373,24 @@ int x86_add_exclusive(unsigned int what) out: mutex_unlock(&pmc_reserve_mutex); + + /* + * Assuming that all exclusive events will share the PMI handler + * (which checks active_events for whether there is work to do), + * we can bump active_events counter right here, except for + * x86_lbr_exclusive_lbr events that go through x86_pmu_event_init() + * path, which already bumps active_events for them. + */ + if (!ret && what != x86_lbr_exclusive_lbr) + atomic_inc(&active_events); + return ret; } void x86_del_exclusive(unsigned int what) { atomic_dec(&x86_pmu.lbr_exclusive[what]); + atomic_dec(&active_events); } int x86_setup_perfctr(struct perf_event *event) @@ -557,6 +571,7 @@ static int __x86_pmu_event_init(struct perf_event *event) if (err) return err; + atomic_inc(&active_events); event->destroy = hw_perf_event_destroy; event->hw.idx = -1; @@ -1429,6 +1444,10 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) u64 finish_clock; int ret; + /* + * All PMUs/events that share this PMI handler should make sure to + * increment active_events for their events. + */ if (!atomic_read(&active_events)) return NMI_DONE; -- cgit v1.2.1 From 2c33645d366d13b969d936b68b9f4875b1fdddea Mon Sep 17 00:00:00 2001 From: "Palik, Imre" Date: Mon, 8 Jun 2015 14:46:49 +0200 Subject: perf/x86: Honor the architectural performance monitoring version Architectural performance monitoring, version 1, doesn't support fixed counters. Currently, even if a hypervisor advertises support for architectural performance monitoring version 1, perf may still try to use the fixed counters, as the constraints are set up based on the CPU model. This patch ensures that perf honors the architectural performance monitoring version returned by CPUID, and it only uses the fixed counters for version 2 and above. (Some of the ideas in this patch came from Peter Zijlstra.) Signed-off-by: Imre Palik Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Anthony Liguori Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433767609-1039-1-git-send-email-imrep.amz@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index afdd7c08d0fc..2813ea0f142e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -3324,13 +3324,13 @@ __init int intel_pmu_init(void) * counter, so do not extend mask to generic counters */ for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != FIXED_EVENT_FLAGS - || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { - continue; + if (c->cmask == FIXED_EVENT_FLAGS + && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; } - - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - c->weight += x86_pmu.num_counters; + c->idxmsk64 &= + ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); + c->weight = hweight64(c->idxmsk64); } } -- cgit v1.2.1 From 04c17341b42699a5859a8afa05e64ba08a4e5235 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 19 Jun 2015 13:49:06 +0200 Subject: x86/boot: Fix overflow warning with 32-bit binutils When building the kernel with 32-bit binutils built with support only for the i386 target, we get the following warning: arch/x86/kernel/head_32.S:66: Warning: shift count out of range (32 is not between 0 and 31) The problem is that in that case, binutils' internal type representation is 32-bit wide and the shift range overflows. In order to fix this, manipulate the shift expression which creates the 4GiB constant to not overflow the shift count. Suggested-by: Michael Matz Reported-and-tested-by: Enrico Mioso Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 53eeb226657c..7e429c99c728 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -62,9 +62,16 @@ #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif -/* Number of possible pages in the lowmem region */ -LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) - +/* + * Number of possible pages in the lowmem region. + * + * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a + * gas warning about overflowing shift count when gas has been compiled + * with only a host target support using a 32-bit type for internal + * representation. + */ +LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT) + /* Enough space to fit pagetables for the low memory linear map */ MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT -- cgit v1.2.1 From f104765b4f81fd74d69e0eb161e89096deade2db Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Thu, 11 Jun 2015 02:05:33 -0400 Subject: KVM: nSVM: Check for NRIPS support before updating control field If hardware doesn't support DecodeAssist - a feature that provides more information about the intercept in the VMCB, KVM decodes the instruction and then updates the next_rip vmcb control field. However, NRIP support itself depends on cpuid Fn8000_000A_EDX[NRIPS]. Since skip_emulated_instruction() doesn't verify nrip support before accepting control.next_rip as valid, avoid writing this field if support isn't present. Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 680753186489..e7685af399e4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -511,8 +511,10 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm->vmcb->control.next_rip != 0) + if (svm->vmcb->control.next_rip != 0) { + WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS)); svm->next_rip = svm->vmcb->control.next_rip; + } if (!svm->next_rip) { if (emulate_instruction(vcpu, EMULTYPE_SKIP) != @@ -4329,7 +4331,9 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, break; } - vmcb->control.next_rip = info->next_rip; + /* TODO: Advertise NRIPS to guest hypervisor unconditionally */ + if (static_cpu_has(X86_FEATURE_NRIPS)) + vmcb->control.next_rip = info->next_rip; vmcb->control.exit_code = icpt_info.exit_code; vmexit = nested_svm_exit_handled(svm); -- cgit v1.2.1 From b18d5431acc7a2fd22767925f3a6f597aa4bd29e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:21 +0800 Subject: KVM: x86: fix CR0.CD virtualization Currently, CR0.CD is not checked when we virtualize memory cache type for noncoherent_dma guests, this patch fixes it by : - setting UC for all memory if CR0.CD = 1 - zapping all the last sptes in MMU if CR0.CD is changed Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 32 ++++++++++++++++++++++---------- arch/x86/kvm/x86.c | 4 ++++ 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06a186b07631..2764381a1b02 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8628,7 +8628,8 @@ static int get_ept_level(void) static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { - u64 ret; + u8 cache; + u64 ipat = 0; /* For VT-d and EPT combination * 1. MMIO: always map as UC @@ -8641,16 +8642,27 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ - if (is_mmio) - ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) - ret = kvm_get_guest_memory_type(vcpu, gfn) << - VMX_EPT_MT_EPTE_SHIFT; - else - ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) - | VMX_EPT_IPAT_BIT; + if (is_mmio) { + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } - return ret; + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + ipat = VMX_EPT_IPAT_BIT; + cache = MTRR_TYPE_WRBACK; + goto exit; + } + + if (kvm_read_cr0(vcpu) & X86_CR0_CD) { + ipat = VMX_EPT_IPAT_BIT; + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } + + cache = kvm_get_guest_memory_type(vcpu, gfn); + +exit: + return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } static int vmx_get_lpage_level(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43f0df7ddc9c..43fdb10c1580 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -621,6 +621,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); + + if ((cr0 ^ old_cr0) & X86_CR0_CD) + kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr0); -- cgit v1.2.1 From ff53604b40b439cbb235f89bda99839ca81d3b9d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:22 +0800 Subject: KVM: x86: move MTRR related code to a separate file MTRR code locates in x86.c and mmu.c so that move them to a separate file to make the organization more clearer and it will be the place where we fully implement vMTRR Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/mmu.c | 103 ------------ arch/x86/kvm/mtrr.c | 335 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 214 +------------------------ arch/x86/kvm/x86.h | 3 + 7 files changed, 342 insertions(+), 318 deletions(-) create mode 100644 arch/x86/kvm/mtrr.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8ca32cfbcbd8..cf8d320dc7a5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -894,7 +894,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); -u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); struct kvm_irq_mask_notifier { void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 16e8f962eaad..470dc6c9d409 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,7 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o + i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c88f0e443669..532aad251cca 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2437,109 +2437,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -/* - * The function is based on mtrr_type_lookup() in - * arch/x86/kernel/cpu/mtrr/generic.c - */ -static int get_mtrr_type(struct mtrr_state_type *mtrr_state, - u64 start, u64 end) -{ - u64 base, mask; - u8 prev_match, curr_match; - int i, num_var_ranges = KVM_NR_VAR_MTRR; - - /* MTRR is completely disabled, use UC for all of physical memory. */ - if (!(mtrr_state->enabled & 0x2)) - return MTRR_TYPE_UNCACHABLE; - - /* Make end inclusive end, instead of exclusive */ - end--; - - /* Look in fixed ranges. Just return the type as per start */ - if (mtrr_state->have_fixed && (mtrr_state->enabled & 0x1) && - (start < 0x100000)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0x1000000) { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state->fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ - prev_match = 0xFF; - for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; - - if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) - continue; - - base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + - (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); - mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + - (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); - - start_state = ((start & mask) == (base & mask)); - end_state = ((end & mask) == (base & mask)); - if (start_state != end_state) - return 0xFE; - - if ((start & mask) != (base & mask)) - continue; - - curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { - prev_match = curr_match; - continue; - } - - if (prev_match == MTRR_TYPE_UNCACHABLE || - curr_match == MTRR_TYPE_UNCACHABLE) - return MTRR_TYPE_UNCACHABLE; - - if ((prev_match == MTRR_TYPE_WRBACK && - curr_match == MTRR_TYPE_WRTHROUGH) || - (prev_match == MTRR_TYPE_WRTHROUGH && - curr_match == MTRR_TYPE_WRBACK)) { - prev_match = MTRR_TYPE_WRTHROUGH; - curr_match = MTRR_TYPE_WRTHROUGH; - } - - if (prev_match != curr_match) - return MTRR_TYPE_UNCACHABLE; - } - - if (prev_match != 0xFF) - return prev_match; - - return mtrr_state->def_type; -} - -u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u8 mtrr; - - mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, - (gfn << PAGE_SHIFT) + PAGE_SIZE); - if (mtrr == 0xfe || mtrr == 0xff) - mtrr = MTRR_TYPE_WRBACK; - return mtrr; -} -EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); - static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c new file mode 100644 index 000000000000..fb2f7e10e8bc --- /dev/null +++ b/arch/x86/kvm/mtrr.c @@ -0,0 +1,335 @@ +/* + * vMTRR implementation + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright(C) 2015 Intel Corporation. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + * Marcelo Tosatti + * Paolo Bonzini + * Xiao Guangrong + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include + +#include "cpuid.h" +#include "mmu.h" + +static bool msr_mtrr_valid(unsigned msr) +{ + switch (msr) { + case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MSR_MTRRfix64K_00000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_CR_PAT: + return true; + case 0x2f8: + return true; + } + return false; +} + +static bool valid_pat_type(unsigned t) +{ + return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ +} + +static bool valid_mtrr_type(unsigned t) +{ + return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ +} + +bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int i; + u64 mask; + + if (!msr_mtrr_valid(msr)) + return false; + + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return false; + return valid_mtrr_type(data & 0xff); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8 ; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } + + /* variable MTRRs */ + WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); + + mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + if ((msr & 1) == 0) { + /* MTRR base */ + if (!valid_mtrr_type(data & 0xff)) + return false; + mask |= 0xf00; + } else + /* MTRR mask */ + mask |= 0x7ff; + if (data & mask) { + kvm_inject_gp(vcpu, 0); + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(kvm_mtrr_valid); + +static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct mtrr_state_type *mtrr_state = &vcpu->arch.mtrr_state; + unsigned char mtrr_enabled = mtrr_state->enabled; + gfn_t start, end, mask; + int index; + bool is_fixed = true; + + if (msr == MSR_IA32_CR_PAT || !tdp_enabled || + !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + return; + + if (!(mtrr_enabled & 0x2) && msr != MSR_MTRRdefType) + return; + + switch (msr) { + case MSR_MTRRfix64K_00000: + start = 0x0; + end = 0x80000; + break; + case MSR_MTRRfix16K_80000: + start = 0x80000; + end = 0xa0000; + break; + case MSR_MTRRfix16K_A0000: + start = 0xa0000; + end = 0xc0000; + break; + case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: + index = msr - MSR_MTRRfix4K_C0000; + start = 0xc0000 + index * (32 << 10); + end = start + (32 << 10); + break; + case MSR_MTRRdefType: + is_fixed = false; + start = 0x0; + end = ~0ULL; + break; + default: + /* variable range MTRRs. */ + is_fixed = false; + index = (msr - 0x200) / 2; + start = (((u64)mtrr_state->var_ranges[index].base_hi) << 32) + + (mtrr_state->var_ranges[index].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state->var_ranges[index].mask_hi) << 32) + + (mtrr_state->var_ranges[index].mask_lo & PAGE_MASK); + mask |= ~0ULL << cpuid_maxphyaddr(vcpu); + + end = ((start & mask) | ~mask) + 1; + } + + if (is_fixed && !(mtrr_enabled & 0x1)) + return; + + kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); +} + +int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + + if (!kvm_mtrr_valid(vcpu, msr, data)) + return 1; + + if (msr == MSR_MTRRdefType) { + vcpu->arch.mtrr_state.def_type = data; + vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; + } else if (msr == MSR_MTRRfix64K_00000) + p[0] = data; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + p[1 + msr - MSR_MTRRfix16K_80000] = data; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + p[3 + msr - MSR_MTRRfix4K_C0000] = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat = data; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pt = data; + } + + update_mtrr(vcpu, msr); + return 0; +} + +int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + + if (!msr_mtrr_valid(msr)) + return 1; + + if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.def_type + + (vcpu->arch.mtrr_state.enabled << 10); + else if (msr == MSR_MTRRfix64K_00000) + *pdata = p[0]; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pdata = *pt; + } + + return 0; +} + +/* + * The function is based on mtrr_type_lookup() in + * arch/x86/kernel/cpu/mtrr/generic.c + */ +static int get_mtrr_type(struct mtrr_state_type *mtrr_state, + u64 start, u64 end) +{ + u64 base, mask; + u8 prev_match, curr_match; + int i, num_var_ranges = KVM_NR_VAR_MTRR; + + /* MTRR is completely disabled, use UC for all of physical memory. */ + if (!(mtrr_state->enabled & 0x2)) + return MTRR_TYPE_UNCACHABLE; + + /* Make end inclusive end, instead of exclusive */ + end--; + + /* Look in fixed ranges. Just return the type as per start */ + if (mtrr_state->have_fixed && (mtrr_state->enabled & 0x1) && + (start < 0x100000)) { + int idx; + + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0x1000000) { + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state->fixed_ranges[idx]; + } + } + + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + unsigned short start_state, end_state; + + if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) + continue; + + base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + + (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + + (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); + + start_state = ((start & mask) == (base & mask)); + end_state = ((end & mask) == (base & mask)); + if (start_state != end_state) + return 0xFE; + + if ((start & mask) != (base & mask)) + continue; + + curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (prev_match == MTRR_TYPE_UNCACHABLE || + curr_match == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + if ((prev_match == MTRR_TYPE_WRBACK && + curr_match == MTRR_TYPE_WRTHROUGH) || + (prev_match == MTRR_TYPE_WRTHROUGH && + curr_match == MTRR_TYPE_WRBACK)) { + prev_match = MTRR_TYPE_WRTHROUGH; + curr_match = MTRR_TYPE_WRTHROUGH; + } + + if (prev_match != curr_match) + return MTRR_TYPE_UNCACHABLE; + } + + if (prev_match != 0xFF) + return prev_match; + + return mtrr_state->def_type; +} + +u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u8 mtrr; + + mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, + (gfn << PAGE_SHIFT) + PAGE_SIZE); + if (mtrr == 0xfe || mtrr == 0xff) + mtrr = MTRR_TYPE_WRBACK; + return mtrr; +} +EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2764381a1b02..44eafdb440c9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8659,7 +8659,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) goto exit; } - cache = kvm_get_guest_memory_type(vcpu, gfn); + cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); exit: return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43fdb10c1580..e2bc79821b45 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include #include /* Ugh! */ @@ -1803,179 +1802,6 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } -static bool msr_mtrr_valid(unsigned msr) -{ - switch (msr) { - case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: - case MSR_MTRRfix64K_00000: - case MSR_MTRRfix16K_80000: - case MSR_MTRRfix16K_A0000: - case MSR_MTRRfix4K_C0000: - case MSR_MTRRfix4K_C8000: - case MSR_MTRRfix4K_D0000: - case MSR_MTRRfix4K_D8000: - case MSR_MTRRfix4K_E0000: - case MSR_MTRRfix4K_E8000: - case MSR_MTRRfix4K_F0000: - case MSR_MTRRfix4K_F8000: - case MSR_MTRRdefType: - case MSR_IA32_CR_PAT: - return true; - case 0x2f8: - return true; - } - return false; -} - -static bool valid_pat_type(unsigned t) -{ - return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ -} - -static bool valid_mtrr_type(unsigned t) -{ - return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ -} - -bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - int i; - u64 mask; - - if (!msr_mtrr_valid(msr)) - return false; - - if (msr == MSR_IA32_CR_PAT) { - for (i = 0; i < 8; i++) - if (!valid_pat_type((data >> (i * 8)) & 0xff)) - return false; - return true; - } else if (msr == MSR_MTRRdefType) { - if (data & ~0xcff) - return false; - return valid_mtrr_type(data & 0xff); - } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { - for (i = 0; i < 8 ; i++) - if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) - return false; - return true; - } - - /* variable MTRRs */ - WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); - - mask = (~0ULL) << cpuid_maxphyaddr(vcpu); - if ((msr & 1) == 0) { - /* MTRR base */ - if (!valid_mtrr_type(data & 0xff)) - return false; - mask |= 0xf00; - } else - /* MTRR mask */ - mask |= 0x7ff; - if (data & mask) { - kvm_inject_gp(vcpu, 0); - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(kvm_mtrr_valid); - -static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) -{ - struct mtrr_state_type *mtrr_state = &vcpu->arch.mtrr_state; - unsigned char mtrr_enabled = mtrr_state->enabled; - gfn_t start, end, mask; - int index; - bool is_fixed = true; - - if (msr == MSR_IA32_CR_PAT || !tdp_enabled || - !kvm_arch_has_noncoherent_dma(vcpu->kvm)) - return; - - if (!(mtrr_enabled & 0x2) && msr != MSR_MTRRdefType) - return; - - switch (msr) { - case MSR_MTRRfix64K_00000: - start = 0x0; - end = 0x80000; - break; - case MSR_MTRRfix16K_80000: - start = 0x80000; - end = 0xa0000; - break; - case MSR_MTRRfix16K_A0000: - start = 0xa0000; - end = 0xc0000; - break; - case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: - index = msr - MSR_MTRRfix4K_C0000; - start = 0xc0000 + index * (32 << 10); - end = start + (32 << 10); - break; - case MSR_MTRRdefType: - is_fixed = false; - start = 0x0; - end = ~0ULL; - break; - default: - /* variable range MTRRs. */ - is_fixed = false; - index = (msr - 0x200) / 2; - start = (((u64)mtrr_state->var_ranges[index].base_hi) << 32) + - (mtrr_state->var_ranges[index].base_lo & PAGE_MASK); - mask = (((u64)mtrr_state->var_ranges[index].mask_hi) << 32) + - (mtrr_state->var_ranges[index].mask_lo & PAGE_MASK); - mask |= ~0ULL << cpuid_maxphyaddr(vcpu); - - end = ((start & mask) | ~mask) + 1; - } - - if (is_fixed && !(mtrr_enabled & 0x1)) - return; - - kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); -} - -static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - - if (!kvm_mtrr_valid(vcpu, msr, data)) - return 1; - - if (msr == MSR_MTRRdefType) { - vcpu->arch.mtrr_state.def_type = data; - vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; - } else if (msr == MSR_MTRRfix64K_00000) - p[0] = data; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - p[1 + msr - MSR_MTRRfix16K_80000] = data; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - p[3 + msr - MSR_MTRRfix4K_C0000] = data; - else if (msr == MSR_IA32_CR_PAT) - vcpu->arch.pat = data; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - u64 *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - if (!is_mtrr_mask) - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; - else - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; - *pt = data; - } - - update_mtrr(vcpu, msr); - return 0; -} - static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 mcg_cap = vcpu->arch.mcg_cap; @@ -2267,7 +2093,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) __func__, data); break; case 0x200 ... 0x2ff: - return set_msr_mtrr(vcpu, msr, data); + return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: @@ -2479,42 +2305,6 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) } EXPORT_SYMBOL_GPL(kvm_get_msr); -static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - - if (!msr_mtrr_valid(msr)) - return 1; - - if (msr == MSR_MTRRdefType) - *pdata = vcpu->arch.mtrr_state.def_type + - (vcpu->arch.mtrr_state.enabled << 10); - else if (msr == MSR_MTRRfix64K_00000) - *pdata = p[0]; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; - else if (msr == MSR_IA32_CR_PAT) - *pdata = vcpu->arch.pat; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - u64 *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - if (!is_mtrr_mask) - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; - else - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; - *pdata = *pt; - } - - return 0; -} - static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -2656,7 +2446,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x500 | KVM_NR_VAR_MTRR; break; case 0x200 ... 0x2ff: - return get_msr_mtrr(vcpu, msr_info->index, &msr_info->data); + return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; break; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 01a1d011e073..aeb0bb2f1df4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -162,7 +162,10 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ | XSTATE_BNDREGS | XSTATE_BNDCSR \ -- cgit v1.2.1 From eb839917a75207b89799e3500211163cb6de0dea Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:23 +0800 Subject: KVM: MTRR: handle MSR_MTRRcap in kvm_mtrr_get_msr MSR_MTRRcap is a MTRR msr so move the handler to the common place, also add some comments to make the hard code more readable Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 12 ++++++++++++ arch/x86/kvm/x86.c | 2 -- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index fb2f7e10e8bc..a05846a524d9 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -199,6 +199,18 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + /* MSR_MTRRcap is a readonly MSR. */ + if (msr == MSR_MTRRcap) { + /* + * SMRR = 0 + * WC = 1 + * FIX = 1 + * VCNT = KVM_NR_VAR_MTRR + */ + *pdata = 0x500 | KVM_NR_VAR_MTRR; + return 0; + } + if (!msr_mtrr_valid(msr)) return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e2bc79821b45..fb6c9a11b5c1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2443,8 +2443,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x100000000ULL; break; case MSR_MTRRcap: - msr_info->data = 0x500 | KVM_NR_VAR_MTRR; - break; case 0x200 ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ -- cgit v1.2.1 From 70109e7d9d4ac7182786ddf7cd53bc651a157896 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:24 +0800 Subject: KVM: MTRR: remove mtrr_state.have_fixed vMTRR does not depend on any host MTRR feature and fixed MTRRs have always been implemented, so drop this field Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 9 ++++++++- arch/x86/kvm/mtrr.c | 7 +++---- arch/x86/kvm/x86.c | 1 - 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cf8d320dc7a5..cbf9f076f57c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -342,6 +342,13 @@ enum { KVM_DEBUGREG_RELOAD = 4, }; +struct kvm_mtrr { + struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; + mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; + unsigned char enabled; + mtrr_type def_type; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -472,7 +479,7 @@ struct kvm_vcpu_arch { bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */ - struct mtrr_state_type mtrr_state; + struct kvm_mtrr mtrr_state; u64 pat; unsigned switch_db_regs; diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index a05846a524d9..ce061fffc077 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(kvm_mtrr_valid); static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) { - struct mtrr_state_type *mtrr_state = &vcpu->arch.mtrr_state; + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; unsigned char mtrr_enabled = mtrr_state->enabled; gfn_t start, end, mask; int index; @@ -247,7 +247,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * The function is based on mtrr_type_lookup() in * arch/x86/kernel/cpu/mtrr/generic.c */ -static int get_mtrr_type(struct mtrr_state_type *mtrr_state, +static int get_mtrr_type(struct kvm_mtrr *mtrr_state, u64 start, u64 end) { u64 base, mask; @@ -262,8 +262,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state, end--; /* Look in fixed ranges. Just return the type as per start */ - if (mtrr_state->have_fixed && (mtrr_state->enabled & 0x1) && - (start < 0x100000)) { + if ((mtrr_state->enabled & 0x1) && (start < 0x100000)) { int idx; if (start < 0x80000) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fb6c9a11b5c1..2ffad7f2a28e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7379,7 +7379,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int r; - vcpu->arch.mtrr_state.have_fixed = 1; r = vcpu_load(vcpu); if (r) return r; -- cgit v1.2.1 From 910a6aae4e2e45855efc4a268e43eed2d8445575 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:25 +0800 Subject: KVM: MTRR: exactly define the size of variable MTRRs Only KVM_NR_VAR_MTRR variable MTRRs are available in KVM guest Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cbf9f076f57c..fe9cbe49e272 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -343,7 +343,7 @@ enum { }; struct kvm_mtrr { - struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; + struct mtrr_var_range var_ranges[KVM_NR_VAR_MTRR]; mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; unsigned char enabled; mtrr_type def_type; -- cgit v1.2.1 From 10fac2dc2b3b549d371d67f57193362b6bcc6dfd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:26 +0800 Subject: KVM: MTRR: clean up mtrr default type Drop kvm_mtrr->enable, omit the decode/code workload and get rid of all the hard code Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/mtrr.c | 40 ++++++++++++++++++++++++++++------------ 2 files changed, 29 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fe9cbe49e272..8d43006a6df0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -345,8 +345,7 @@ enum { struct kvm_mtrr { struct mtrr_var_range var_ranges[KVM_NR_VAR_MTRR]; mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; - unsigned char enabled; - mtrr_type def_type; + u64 deftype; }; struct kvm_vcpu_arch { diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index ce061fffc077..191de6435f23 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -22,6 +22,10 @@ #include "cpuid.h" #include "mmu.h" +#define IA32_MTRR_DEF_TYPE_E (1ULL << 11) +#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10) +#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff) + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -101,10 +105,24 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } EXPORT_SYMBOL_GPL(kvm_mtrr_valid); +static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state) +{ + return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E); +} + +static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state) +{ + return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE); +} + +static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state) +{ + return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK; +} + static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; - unsigned char mtrr_enabled = mtrr_state->enabled; gfn_t start, end, mask; int index; bool is_fixed = true; @@ -113,7 +131,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) !kvm_arch_has_noncoherent_dma(vcpu->kvm)) return; - if (!(mtrr_enabled & 0x2) && msr != MSR_MTRRdefType) + if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) return; switch (msr) { @@ -152,7 +170,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) end = ((start & mask) | ~mask) + 1; } - if (is_fixed && !(mtrr_enabled & 0x1)) + if (is_fixed && !fixed_mtrr_is_enabled(mtrr_state)) return; kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); @@ -165,10 +183,9 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!kvm_mtrr_valid(vcpu, msr, data)) return 1; - if (msr == MSR_MTRRdefType) { - vcpu->arch.mtrr_state.def_type = data; - vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; - } else if (msr == MSR_MTRRfix64K_00000) + if (msr == MSR_MTRRdefType) + vcpu->arch.mtrr_state.deftype = data; + else if (msr == MSR_MTRRfix64K_00000) p[0] = data; else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) p[1 + msr - MSR_MTRRfix16K_80000] = data; @@ -215,8 +232,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 1; if (msr == MSR_MTRRdefType) - *pdata = vcpu->arch.mtrr_state.def_type + - (vcpu->arch.mtrr_state.enabled << 10); + *pdata = vcpu->arch.mtrr_state.deftype; else if (msr == MSR_MTRRfix64K_00000) *pdata = p[0]; else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) @@ -255,14 +271,14 @@ static int get_mtrr_type(struct kvm_mtrr *mtrr_state, int i, num_var_ranges = KVM_NR_VAR_MTRR; /* MTRR is completely disabled, use UC for all of physical memory. */ - if (!(mtrr_state->enabled & 0x2)) + if (!mtrr_is_enabled(mtrr_state)) return MTRR_TYPE_UNCACHABLE; /* Make end inclusive end, instead of exclusive */ end--; /* Look in fixed ranges. Just return the type as per start */ - if ((mtrr_state->enabled & 0x1) && (start < 0x100000)) { + if (fixed_mtrr_is_enabled(mtrr_state) && (start < 0x100000)) { int idx; if (start < 0x80000) { @@ -330,7 +346,7 @@ static int get_mtrr_type(struct kvm_mtrr *mtrr_state, if (prev_match != 0xFF) return prev_match; - return mtrr_state->def_type; + return mtrr_default_type(mtrr_state); } u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) -- cgit v1.2.1 From 86fd52701cfae760711fb02a03c1ab8c80ea72f3 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:27 +0800 Subject: KVM: MTRR: do not split 64 bits MSR content Variable MTRR MSRs are 64 bits which are directly accessed with full length, no reason to split them to two 32 bits Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 ++++++- arch/x86/kvm/mtrr.c | 32 ++++++++++---------------------- 2 files changed, 16 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8d43006a6df0..f73554874845 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -342,8 +342,13 @@ enum { KVM_DEBUGREG_RELOAD = 4, }; +struct kvm_mtrr_range { + u64 base; + u64 mask; +}; + struct kvm_mtrr { - struct mtrr_var_range var_ranges[KVM_NR_VAR_MTRR]; + struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR]; mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; u64 deftype; }; diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 191de6435f23..c4b80a46a530 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -161,10 +161,8 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) /* variable range MTRRs. */ is_fixed = false; index = (msr - 0x200) / 2; - start = (((u64)mtrr_state->var_ranges[index].base_hi) << 32) + - (mtrr_state->var_ranges[index].base_lo & PAGE_MASK); - mask = (((u64)mtrr_state->var_ranges[index].mask_hi) << 32) + - (mtrr_state->var_ranges[index].mask_lo & PAGE_MASK); + start = mtrr_state->var_ranges[index].base & PAGE_MASK; + mask = mtrr_state->var_ranges[index].mask & PAGE_MASK; mask |= ~0ULL << cpuid_maxphyaddr(vcpu); end = ((start & mask) | ~mask) + 1; @@ -195,17 +193,13 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) vcpu->arch.pat = data; else { /* Variable MTRRs */ int idx, is_mtrr_mask; - u64 *pt; idx = (msr - 0x200) / 2; is_mtrr_mask = msr - 0x200 - 2 * idx; if (!is_mtrr_mask) - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + vcpu->arch.mtrr_state.var_ranges[idx].base = data; else - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; - *pt = data; + vcpu->arch.mtrr_state.var_ranges[idx].mask = data; } update_mtrr(vcpu, msr); @@ -243,17 +237,13 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) *pdata = vcpu->arch.pat; else { /* Variable MTRRs */ int idx, is_mtrr_mask; - u64 *pt; idx = (msr - 0x200) / 2; is_mtrr_mask = msr - 0x200 - 2 * idx; if (!is_mtrr_mask) - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + *pdata = vcpu->arch.mtrr_state.var_ranges[idx].base; else - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; - *pdata = *pt; + *pdata = vcpu->arch.mtrr_state.var_ranges[idx].mask; } return 0; @@ -305,13 +295,11 @@ static int get_mtrr_type(struct kvm_mtrr *mtrr_state, for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state; - if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) + if (!(mtrr_state->var_ranges[i].mask & (1 << 11))) continue; - base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + - (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); - mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + - (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); + base = mtrr_state->var_ranges[i].base & PAGE_MASK; + mask = mtrr_state->var_ranges[i].mask & PAGE_MASK; start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); @@ -321,7 +309,7 @@ static int get_mtrr_type(struct kvm_mtrr *mtrr_state, if ((start & mask) != (base & mask)) continue; - curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; + curr_match = mtrr_state->var_ranges[i].base & 0xff; if (prev_match == 0xFF) { prev_match = curr_match; continue; -- cgit v1.2.1 From 3f3f78b61462fdc62469334c259b1e5555ebd1ed Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:28 +0800 Subject: KVM: MTRR: improve kvm_mtrr_get_guest_memory_type - kvm_mtrr_get_guest_memory_type() only checks one page in MTRRs so that it's unnecessary to check to see if the range is partially covered in MTRR - optimize the check of overlap memory type and add some comments to explain the precedence Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 94 ++++++++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index c4b80a46a530..2ea12130e74a 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -249,24 +249,22 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -/* - * The function is based on mtrr_type_lookup() in - * arch/x86/kernel/cpu/mtrr/generic.c - */ -static int get_mtrr_type(struct kvm_mtrr *mtrr_state, - u64 start, u64 end) +u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) { - u64 base, mask; - u8 prev_match, curr_match; - int i, num_var_ranges = KVM_NR_VAR_MTRR; + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; + u64 base, mask, start; + int i, num_var_ranges, type; + const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK) + | (1 << MTRR_TYPE_WRTHROUGH); + + start = gfn_to_gpa(gfn); + num_var_ranges = KVM_NR_VAR_MTRR; + type = -1; /* MTRR is completely disabled, use UC for all of physical memory. */ if (!mtrr_is_enabled(mtrr_state)) return MTRR_TYPE_UNCACHABLE; - /* Make end inclusive end, instead of exclusive */ - end--; - /* Look in fixed ranges. Just return the type as per start */ if (fixed_mtrr_is_enabled(mtrr_state) && (start < 0x100000)) { int idx; @@ -291,9 +289,8 @@ static int get_mtrr_type(struct kvm_mtrr *mtrr_state, * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; + int curr_type; if (!(mtrr_state->var_ranges[i].mask & (1 << 11))) continue; @@ -301,50 +298,57 @@ static int get_mtrr_type(struct kvm_mtrr *mtrr_state, base = mtrr_state->var_ranges[i].base & PAGE_MASK; mask = mtrr_state->var_ranges[i].mask & PAGE_MASK; - start_state = ((start & mask) == (base & mask)); - end_state = ((end & mask) == (base & mask)); - if (start_state != end_state) - return 0xFE; - if ((start & mask) != (base & mask)) continue; - curr_match = mtrr_state->var_ranges[i].base & 0xff; - if (prev_match == 0xFF) { - prev_match = curr_match; + /* + * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR + * Precedences. + */ + + curr_type = mtrr_state->var_ranges[i].base & 0xff; + if (type == -1) { + type = curr_type; continue; } - if (prev_match == MTRR_TYPE_UNCACHABLE || - curr_match == MTRR_TYPE_UNCACHABLE) + /* + * If two or more variable memory ranges match and the + * memory types are identical, then that memory type is + * used. + */ + if (type == curr_type) + continue; + + /* + * If two or more variable memory ranges match and one of + * the memory types is UC, the UC memory type used. + */ + if (curr_type == MTRR_TYPE_UNCACHABLE) return MTRR_TYPE_UNCACHABLE; - if ((prev_match == MTRR_TYPE_WRBACK && - curr_match == MTRR_TYPE_WRTHROUGH) || - (prev_match == MTRR_TYPE_WRTHROUGH && - curr_match == MTRR_TYPE_WRBACK)) { - prev_match = MTRR_TYPE_WRTHROUGH; - curr_match = MTRR_TYPE_WRTHROUGH; + /* + * If two or more variable memory ranges match and the + * memory types are WT and WB, the WT memory type is used. + */ + if (((1 << type) & wt_wb_mask) && + ((1 << curr_type) & wt_wb_mask)) { + type = MTRR_TYPE_WRTHROUGH; + continue; } - if (prev_match != curr_match) - return MTRR_TYPE_UNCACHABLE; + /* + * For overlaps not defined by the above rules, processor + * behavior is undefined. + */ + + /* We use WB for this undefined behavior. :( */ + return MTRR_TYPE_WRBACK; } - if (prev_match != 0xFF) - return prev_match; + if (type != -1) + return type; return mtrr_default_type(mtrr_state); } - -u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u8 mtrr; - - mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, - (gfn << PAGE_SHIFT) + PAGE_SIZE); - if (mtrr == 0xfe || mtrr == 0xff) - mtrr = MTRR_TYPE_WRBACK; - return mtrr; -} EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); -- cgit v1.2.1 From de9aef5e1ad6e504d6992df17dfa193e8ba5cc34 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:29 +0800 Subject: KVM: MTRR: introduce fixed_mtrr_segment table This table summarizes the information of fixed MTRRs and introduce some APIs to abstract its operation which helps us to clean up the code and will be used in later patches Signed-off-by: Xiao Guangrong [Change range_size to range_shift, in order to avoid udivdi3 errors. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 200 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 147 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 2ea12130e74a..cfeeaef9e419 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -120,12 +120,132 @@ static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state) return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK; } +/* +* Three terms are used in the following code: +* - segment, it indicates the address segments covered by fixed MTRRs. +* - unit, it corresponds to the MSR entry in the segment. +* - range, a range is covered in one memory cache type. +*/ +struct fixed_mtrr_segment { + u64 start; + u64 end; + + int range_shift; + + /* the start position in kvm_mtrr.fixed_ranges[]. */ + int range_start; +}; + +static struct fixed_mtrr_segment fixed_seg_table[] = { + /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */ + { + .start = 0x0, + .end = 0x80000, + .range_shift = 16, /* 64K */ + .range_start = 0, + }, + + /* + * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units, + * 16K fixed mtrr. + */ + { + .start = 0x80000, + .end = 0xc0000, + .range_shift = 14, /* 16K */ + .range_start = 8, + }, + + /* + * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units, + * 4K fixed mtrr. + */ + { + .start = 0xc0000, + .end = 0x100000, + .range_shift = 12, /* 12K */ + .range_start = 24, + } +}; + +/* + * The size of unit is covered in one MSR, one MSR entry contains + * 8 ranges so that unit size is always 8 * 2^range_shift. + */ +static u64 fixed_mtrr_seg_unit_size(int seg) +{ + return 8 << fixed_seg_table[seg].range_shift; +} + +static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit) +{ + switch (msr) { + case MSR_MTRRfix64K_00000: + *seg = 0; + *unit = 0; + break; + case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: + *seg = 1; + *unit = msr - MSR_MTRRfix16K_80000; + break; + case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: + *seg = 2; + *unit = msr - MSR_MTRRfix4K_C0000; + break; + default: + return false; + } + + return true; +} + +static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + u64 unit_size = fixed_mtrr_seg_unit_size(seg); + + *start = mtrr_seg->start + unit * unit_size; + *end = *start + unit_size; + WARN_ON(*end > mtrr_seg->end); +} + +static int fixed_mtrr_seg_unit_range_index(int seg, int unit) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + + WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg) + > mtrr_seg->end); + + /* each unit has 8 ranges. */ + return mtrr_seg->range_start + 8 * unit; +} + +static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end) +{ + int seg, unit; + + if (!fixed_msr_to_seg_unit(msr, &seg, &unit)) + return false; + + fixed_mtrr_seg_unit_range(seg, unit, start, end); + return true; +} + +static int fixed_msr_to_range_index(u32 msr) +{ + int seg, unit; + + if (!fixed_msr_to_seg_unit(msr, &seg, &unit)) + return -1; + + return fixed_mtrr_seg_unit_range_index(seg, unit); +} + static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; gfn_t start, end, mask; int index; - bool is_fixed = true; if (msr == MSR_IA32_CR_PAT || !tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm)) @@ -134,32 +254,15 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) return; - switch (msr) { - case MSR_MTRRfix64K_00000: - start = 0x0; - end = 0x80000; - break; - case MSR_MTRRfix16K_80000: - start = 0x80000; - end = 0xa0000; - break; - case MSR_MTRRfix16K_A0000: - start = 0xa0000; - end = 0xc0000; - break; - case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: - index = msr - MSR_MTRRfix4K_C0000; - start = 0xc0000 + index * (32 << 10); - end = start + (32 << 10); - break; - case MSR_MTRRdefType: - is_fixed = false; + /* fixed MTRRs. */ + if (fixed_msr_to_range(msr, &start, &end)) { + if (!fixed_mtrr_is_enabled(mtrr_state)) + return; + } else if (msr == MSR_MTRRdefType) { start = 0x0; end = ~0ULL; - break; - default: + } else { /* variable range MTRRs. */ - is_fixed = false; index = (msr - 0x200) / 2; start = mtrr_state->var_ranges[index].base & PAGE_MASK; mask = mtrr_state->var_ranges[index].mask & PAGE_MASK; @@ -168,38 +271,32 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) end = ((start & mask) | ~mask) + 1; } - if (is_fixed && !fixed_mtrr_is_enabled(mtrr_state)) - return; - kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); } int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + int index; if (!kvm_mtrr_valid(vcpu, msr, data)) return 1; - if (msr == MSR_MTRRdefType) + index = fixed_msr_to_range_index(msr); + if (index >= 0) + *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data; + else if (msr == MSR_MTRRdefType) vcpu->arch.mtrr_state.deftype = data; - else if (msr == MSR_MTRRfix64K_00000) - p[0] = data; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - p[1 + msr - MSR_MTRRfix16K_80000] = data; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - p[3 + msr - MSR_MTRRfix4K_C0000] = data; else if (msr == MSR_IA32_CR_PAT) vcpu->arch.pat = data; else { /* Variable MTRRs */ - int idx, is_mtrr_mask; + int is_mtrr_mask; - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; + index = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * index; if (!is_mtrr_mask) - vcpu->arch.mtrr_state.var_ranges[idx].base = data; + vcpu->arch.mtrr_state.var_ranges[index].base = data; else - vcpu->arch.mtrr_state.var_ranges[idx].mask = data; + vcpu->arch.mtrr_state.var_ranges[index].mask = data; } update_mtrr(vcpu, msr); @@ -208,7 +305,7 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { - u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + int index; /* MSR_MTRRcap is a readonly MSR. */ if (msr == MSR_MTRRcap) { @@ -225,25 +322,22 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) if (!msr_mtrr_valid(msr)) return 1; - if (msr == MSR_MTRRdefType) + index = fixed_msr_to_range_index(msr); + if (index >= 0) + *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index]; + else if (msr == MSR_MTRRdefType) *pdata = vcpu->arch.mtrr_state.deftype; - else if (msr == MSR_MTRRfix64K_00000) - *pdata = p[0]; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; else if (msr == MSR_IA32_CR_PAT) *pdata = vcpu->arch.pat; else { /* Variable MTRRs */ - int idx, is_mtrr_mask; + int is_mtrr_mask; - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; + index = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * index; if (!is_mtrr_mask) - *pdata = vcpu->arch.mtrr_state.var_ranges[idx].base; + *pdata = vcpu->arch.mtrr_state.var_ranges[index].base; else - *pdata = vcpu->arch.mtrr_state.var_ranges[idx].mask; + *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; } return 0; -- cgit v1.2.1 From a13842dc668b40daef4327294a6d3bdc8bd30276 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:30 +0800 Subject: KVM: MTRR: introduce var_mtrr_range It gets the range for the specified variable MTRR Signed-off-by: Xiao Guangrong [Simplify boolean operations. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index cfeeaef9e419..b081e3ba186f 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -241,10 +241,25 @@ static int fixed_msr_to_range_index(u32 msr) return fixed_mtrr_seg_unit_range_index(seg, unit); } +static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end) +{ + u64 mask; + + *start = range->base & PAGE_MASK; + + mask = range->mask & PAGE_MASK; + mask |= ~0ULL << boot_cpu_data.x86_phys_bits; + + /* This cannot overflow because writing to the reserved bits of + * variable MTRRs causes a #GP. + */ + *end = (*start | ~mask) + 1; +} + static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; - gfn_t start, end, mask; + gfn_t start, end; int index; if (msr == MSR_IA32_CR_PAT || !tdp_enabled || @@ -264,11 +279,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) } else { /* variable range MTRRs. */ index = (msr - 0x200) / 2; - start = mtrr_state->var_ranges[index].base & PAGE_MASK; - mask = mtrr_state->var_ranges[index].mask & PAGE_MASK; - mask |= ~0ULL << cpuid_maxphyaddr(vcpu); - - end = ((start & mask) | ~mask) + 1; + var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end); } kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); -- cgit v1.2.1 From 19efffa244071ccd0385b240d03adb38feaab04e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:31 +0800 Subject: KVM: MTRR: sort variable MTRRs Sort all valid variable MTRRs based on its base address, it will help us to check a range to see if it's fully contained in variable MTRRs Signed-off-by: Xiao Guangrong [Fix list insertion sort, simplify var_mtrr_range_is_valid to just test the V bit. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/mtrr.c | 50 ++++++++++++++++++++++++++++++++--------- arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/x86.h | 1 + 4 files changed, 45 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f73554874845..f2d60cce7595 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -345,12 +345,15 @@ enum { struct kvm_mtrr_range { u64 base; u64 mask; + struct list_head node; }; struct kvm_mtrr { struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR]; mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; u64 deftype; + + struct list_head head; }; struct kvm_vcpu_arch { diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index b081e3ba186f..faa582488bb8 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -285,6 +285,39 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); } +static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range) +{ + return (range->mask & (1 << 11)) != 0; +} + +static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; + struct kvm_mtrr_range *tmp, *cur; + int index, is_mtrr_mask; + + index = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * index; + cur = &mtrr_state->var_ranges[index]; + + /* remove the entry if it's in the list. */ + if (var_mtrr_range_is_valid(cur)) + list_del(&mtrr_state->var_ranges[index].node); + + if (!is_mtrr_mask) + cur->base = data; + else + cur->mask = data; + + /* add it to the list if it's enabled. */ + if (var_mtrr_range_is_valid(cur)) { + list_for_each_entry(tmp, &mtrr_state->head, node) + if (cur->base >= tmp->base) + break; + list_add_tail(&cur->node, &tmp->node); + } +} + int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { int index; @@ -299,16 +332,8 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) vcpu->arch.mtrr_state.deftype = data; else if (msr == MSR_IA32_CR_PAT) vcpu->arch.pat = data; - else { /* Variable MTRRs */ - int is_mtrr_mask; - - index = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * index; - if (!is_mtrr_mask) - vcpu->arch.mtrr_state.var_ranges[index].base = data; - else - vcpu->arch.mtrr_state.var_ranges[index].mask = data; - } + else + set_var_mtrr_msr(vcpu, msr, data); update_mtrr(vcpu, msr); return 0; @@ -354,6 +379,11 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } +void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu) +{ + INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head); +} + u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2ffad7f2a28e..6574fa36cb65 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7379,13 +7379,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int r; + kvm_vcpu_mtrr_init(vcpu); r = vcpu_load(vcpu); if (r) return r; kvm_vcpu_reset(vcpu, false); kvm_mmu_setup(vcpu); vcpu_put(vcpu); - return r; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index aeb0bb2f1df4..0e4727c49279 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -162,6 +162,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); -- cgit v1.2.1 From f7bfb57b3e89ff89c0da9f93dedab89f68d6ca27 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:32 +0800 Subject: KVM: MTRR: introduce fixed_mtrr_addr_* functions Two functions are introduced: - fixed_mtrr_addr_to_seg() translates the address to the fixed MTRR segment - fixed_mtrr_addr_seg_to_range_index() translates the address to the index of kvm_mtrr.fixed_ranges[] They will be used in the later patch Signed-off-by: Xiao Guangrong [Adjust for range_size->range_shift change. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index faa582488bb8..b5346d20f458 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -241,6 +241,31 @@ static int fixed_msr_to_range_index(u32 msr) return fixed_mtrr_seg_unit_range_index(seg, unit); } +static int fixed_mtrr_addr_to_seg(u64 addr) +{ + struct fixed_mtrr_segment *mtrr_seg; + int seg, seg_num = ARRAY_SIZE(fixed_seg_table); + + for (seg = 0; seg < seg_num; seg++) { + mtrr_seg = &fixed_seg_table[seg]; + if (mtrr_seg->start >= addr && addr < mtrr_seg->end) + return seg; + } + + return -1; +} + +static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg) +{ + struct fixed_mtrr_segment *mtrr_seg; + int index; + + mtrr_seg = &fixed_seg_table[seg]; + index = mtrr_seg->range_start; + index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift; + return index; +} + static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end) { u64 mask; -- cgit v1.2.1 From f571c0973e4b8c888e049b6842e4b4f93b5c609c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:33 +0800 Subject: KVM: MTRR: introduce mtrr_for_each_mem_type It walks all MTRRs and gets all the memory cache type setting for the specified range also it checks if the range is fully covered by MTRRs Signed-off-by: Xiao Guangrong [Adjust for range_size->range_shift change. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index b5346d20f458..4c6ac464063e 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -220,6 +220,15 @@ static int fixed_mtrr_seg_unit_range_index(int seg, int unit) return mtrr_seg->range_start + 8 * unit; } +static int fixed_mtrr_seg_end_range_index(int seg) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + int n; + + n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift; + return mtrr_seg->range_start + n - 1; +} + static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end) { int seg, unit; @@ -266,6 +275,14 @@ static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg) return index; } +static u64 fixed_mtrr_range_end_addr(int seg, int index) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + int pos = index - mtrr_seg->range_start; + + return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift); +} + static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end) { u64 mask; @@ -409,6 +426,177 @@ void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head); } +struct mtrr_iter { + /* input fields. */ + struct kvm_mtrr *mtrr_state; + u64 start; + u64 end; + + /* output fields. */ + int mem_type; + /* [start, end) is not fully covered in MTRRs? */ + bool partial_map; + + /* private fields. */ + union { + /* used for fixed MTRRs. */ + struct { + int index; + int seg; + }; + + /* used for var MTRRs. */ + struct { + struct kvm_mtrr_range *range; + /* max address has been covered in var MTRRs. */ + u64 start_max; + }; + }; + + bool fixed; +}; + +static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter) +{ + int seg, index; + + if (!fixed_mtrr_is_enabled(iter->mtrr_state)) + return false; + + seg = fixed_mtrr_addr_to_seg(iter->start); + if (seg < 0) + return false; + + iter->fixed = true; + index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg); + iter->index = index; + iter->seg = seg; + return true; +} + +static bool match_var_range(struct mtrr_iter *iter, + struct kvm_mtrr_range *range) +{ + u64 start, end; + + var_mtrr_range(range, &start, &end); + if (!(start >= iter->end || end <= iter->start)) { + iter->range = range; + + /* + * the function is called when we do kvm_mtrr.head walking. + * Range has the minimum base address which interleaves + * [looker->start_max, looker->end). + */ + iter->partial_map |= iter->start_max < start; + + /* update the max address has been covered. */ + iter->start_max = max(iter->start_max, end); + return true; + } + + return false; +} + +static void __mtrr_lookup_var_next(struct mtrr_iter *iter) +{ + struct kvm_mtrr *mtrr_state = iter->mtrr_state; + + list_for_each_entry_continue(iter->range, &mtrr_state->head, node) + if (match_var_range(iter, iter->range)) + return; + + iter->range = NULL; + iter->partial_map |= iter->start_max < iter->end; +} + +static void mtrr_lookup_var_start(struct mtrr_iter *iter) +{ + struct kvm_mtrr *mtrr_state = iter->mtrr_state; + + iter->fixed = false; + iter->start_max = iter->start; + iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node); + + __mtrr_lookup_var_next(iter); +} + +static void mtrr_lookup_fixed_next(struct mtrr_iter *iter) +{ + /* terminate the lookup. */ + if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) { + iter->fixed = false; + iter->range = NULL; + return; + } + + iter->index++; + + /* have looked up for all fixed MTRRs. */ + if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges)) + return mtrr_lookup_var_start(iter); + + /* switch to next segment. */ + if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg)) + iter->seg++; +} + +static void mtrr_lookup_var_next(struct mtrr_iter *iter) +{ + __mtrr_lookup_var_next(iter); +} + +static void mtrr_lookup_start(struct mtrr_iter *iter) +{ + if (!mtrr_is_enabled(iter->mtrr_state)) { + iter->partial_map = true; + return; + } + + if (!mtrr_lookup_fixed_start(iter)) + mtrr_lookup_var_start(iter); +} + +static void mtrr_lookup_init(struct mtrr_iter *iter, + struct kvm_mtrr *mtrr_state, u64 start, u64 end) +{ + iter->mtrr_state = mtrr_state; + iter->start = start; + iter->end = end; + iter->partial_map = false; + iter->fixed = false; + iter->range = NULL; + + mtrr_lookup_start(iter); +} + +static bool mtrr_lookup_okay(struct mtrr_iter *iter) +{ + if (iter->fixed) { + iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index]; + return true; + } + + if (iter->range) { + iter->mem_type = iter->range->base & 0xff; + return true; + } + + return false; +} + +static void mtrr_lookup_next(struct mtrr_iter *iter) +{ + if (iter->fixed) + mtrr_lookup_fixed_next(iter); + else + mtrr_lookup_var_next(iter); +} + +#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \ + for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \ + mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_)) + u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; -- cgit v1.2.1 From fa61213746a706dd975661151c35795ca4dd82c2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:34 +0800 Subject: KVM: MTRR: simplify kvm_mtrr_get_guest_memory_type mtrr_for_each_mem_type() is ready now, use it to simplify kvm_mtrr_get_guest_memory_type() Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 64 ++++++++++++++--------------------------------------- 1 file changed, 16 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 4c6ac464063e..1445c4a03a92 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -600,61 +600,23 @@ static void mtrr_lookup_next(struct mtrr_iter *iter) u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; - u64 base, mask, start; - int i, num_var_ranges, type; + struct mtrr_iter iter; + u64 start, end; + int type = -1; const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK) | (1 << MTRR_TYPE_WRTHROUGH); start = gfn_to_gpa(gfn); - num_var_ranges = KVM_NR_VAR_MTRR; - type = -1; - - /* MTRR is completely disabled, use UC for all of physical memory. */ - if (!mtrr_is_enabled(mtrr_state)) - return MTRR_TYPE_UNCACHABLE; - - /* Look in fixed ranges. Just return the type as per start */ - if (fixed_mtrr_is_enabled(mtrr_state) && (start < 0x100000)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0x1000000) { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state->fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ - for (i = 0; i < num_var_ranges; ++i) { - int curr_type; - - if (!(mtrr_state->var_ranges[i].mask & (1 << 11))) - continue; - - base = mtrr_state->var_ranges[i].base & PAGE_MASK; - mask = mtrr_state->var_ranges[i].mask & PAGE_MASK; + end = start + PAGE_SIZE; - if ((start & mask) != (base & mask)) - continue; + mtrr_for_each_mem_type(&iter, mtrr_state, start, end) { + int curr_type = iter.mem_type; /* * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR * Precedences. */ - curr_type = mtrr_state->var_ranges[i].base & 0xff; if (type == -1) { type = curr_type; continue; @@ -694,9 +656,15 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) return MTRR_TYPE_WRBACK; } - if (type != -1) - return type; - - return mtrr_default_type(mtrr_state); + /* It is not covered by MTRRs. */ + if (iter.partial_map) { + /* + * We just check one page, partially covered by MTRRs is + * impossible. + */ + WARN_ON(type != -1); + type = mtrr_default_type(mtrr_state); + } + return type; } EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); -- cgit v1.2.1 From 6a39bbc5da27c3b2520876b71e4f7b50f5313503 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 15 Jun 2015 16:55:35 +0800 Subject: KVM: MTRR: do not map huge page for non-consistent range Based on Intel's SDM, mapping huge page which do not have consistent memory cache for each 4k page will cause undefined behavior In order to avoiding this kind of undefined behavior, we force to use 4k pages under this case Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 20 +++++++++++++++++++- arch/x86/kvm/mtrr.c | 29 +++++++++++++++++++++++++++++ arch/x86/kvm/x86.h | 2 ++ 3 files changed, 50 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 532aad251cca..f807496b62c2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3446,6 +3446,16 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return false; } +static bool +check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + int page_num = KVM_PAGES_PER_HPAGE(level); + + gfn &= ~(page_num - 1); + + return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); +} + static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, bool prefault) { @@ -3471,9 +3481,17 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (mapping_level_dirty_bitmap(vcpu, gfn) || + !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) + force_pt_level = 1; + else + force_pt_level = 0; + if (likely(!force_pt_level)) { level = mapping_level(vcpu, gfn); + if (level > PT_DIRECTORY_LEVEL && + !check_hugepage_cache_consistency(vcpu, gfn, level)) + level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); } else level = PT_PAGE_TABLE_LEVEL; diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 1445c4a03a92..de1d2d8062e2 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -668,3 +668,32 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) return type; } EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); + +bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, + int page_num) +{ + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; + struct mtrr_iter iter; + u64 start, end; + int type = -1; + + start = gfn_to_gpa(gfn); + end = gfn_to_gpa(gfn + page_num); + mtrr_for_each_mem_type(&iter, mtrr_state, start, end) { + if (type == -1) { + type = iter.mem_type; + continue; + } + + if (type != iter.mem_type) + return false; + } + + if (!iter.partial_map) + return true; + + if (type == -1) + return true; + + return type == mtrr_default_type(mtrr_state); +} diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 0e4727c49279..edc8cdcd786b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -167,6 +167,8 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, + int page_num); #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ | XSTATE_BNDREGS | XSTATE_BNDCSR \ -- cgit v1.2.1 From c6702c9dcfe72b63a85e7ae35533c11e2b7c1040 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 19 Jun 2015 13:44:45 +0200 Subject: KVM: x86/vPMU: rename a few PMU functions Before introducing a pmu.h header for them, make the naming more consistent. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 12 ++++---- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/pmu.c | 66 ++++++++++++++++++++--------------------- arch/x86/kvm/x86.c | 18 +++++------ 4 files changed, 49 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f2d60cce7595..d92d7edc016b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1198,14 +1198,14 @@ int kvm_is_in_guest(void); void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); void kvm_pmu_reset(struct kvm_vcpu *vcpu); -void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); -bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); +void kvm_pmu_refresh(struct kvm_vcpu *vcpu); +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); -int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc); -int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); -void kvm_deliver_pmi(struct kvm_vcpu *vcpu); +int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned pmc); +int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); +void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); +void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); int __x86_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 9dadf8d67873..9d69f76aa0fa 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -111,7 +111,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) /* Update physical-address width */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - kvm_pmu_cpuid_update(vcpu); + kvm_pmu_refresh(vcpu); return 0; } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 29fbf9dfdc54..d6a4506f62a8 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -52,7 +52,7 @@ static inline u64 pmc_bitmask(struct kvm_pmc *pmc) return pmu->counter_bitmask[pmc->type]; } -static inline bool pmc_enabled(struct kvm_pmc *pmc) +static inline bool pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); @@ -87,20 +87,20 @@ static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED); } -void kvm_deliver_pmi(struct kvm_vcpu *vcpu) +void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (vcpu->arch.apic) kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } -static void trigger_pmi(struct irq_work *irq_work) +static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, arch.pmu); - kvm_deliver_pmi(vcpu); + kvm_pmu_deliver_pmi(vcpu); } static void kvm_perf_overflow(struct perf_event *perf_event, @@ -138,7 +138,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, } } -static u64 read_pmc(struct kvm_pmc *pmc) +static u64 pmc_read_counter(struct kvm_pmc *pmc) { u64 counter, enabled, running; @@ -153,16 +153,16 @@ static u64 read_pmc(struct kvm_pmc *pmc) return counter & pmc_bitmask(pmc); } -static void stop_counter(struct kvm_pmc *pmc) +static void pmc_stop_counter(struct kvm_pmc *pmc) { if (pmc->perf_event) { - pmc->counter = read_pmc(pmc); + pmc->counter = pmc_read_counter(pmc); perf_event_release_kernel(pmc->perf_event); pmc->perf_event = NULL; } } -static void reprogram_counter(struct kvm_pmc *pmc, u32 type, +static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, unsigned config, bool exclude_user, bool exclude_kernel, bool intr, bool in_tx, bool in_tx_cp) { @@ -224,9 +224,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) pmc->eventsel = eventsel; - stop_counter(pmc); + pmc_stop_counter(pmc); - if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc)) + if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) return; event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; @@ -246,7 +246,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (type == PERF_TYPE_RAW) config = eventsel & X86_RAW_EVENT_MASK; - reprogram_counter(pmc, type, config, + pmc_reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), eventsel & ARCH_PERFMON_EVENTSEL_INT, @@ -259,19 +259,19 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) unsigned en = en_pmi & 0x3; bool pmi = en_pmi & 0x8; - stop_counter(pmc); + pmc_stop_counter(pmc); - if (!en || !pmc_enabled(pmc)) + if (!en || !pmc_is_enabled(pmc)) return; - reprogram_counter(pmc, PERF_TYPE_HARDWARE, + pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, arch_events[fixed_pmc_events[idx]].event_type, !(en & 0x2), /* exclude user */ !(en & 0x1), /* exclude kernel */ pmi, false, false); } -static inline u8 fixed_en_pmi(u64 ctrl, int idx) +static inline u8 fixed_ctrl_field(u64 ctrl, int idx) { return (ctrl >> (idx * 4)) & 0xf; } @@ -281,10 +281,10 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) int i; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - u8 en_pmi = fixed_en_pmi(data, i); + u8 en_pmi = fixed_ctrl_field(data, i); struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i); - if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi) + if (fixed_ctrl_field(pmu->fixed_ctr_ctrl, i) == en_pmi) continue; reprogram_fixed_counter(pmc, en_pmi, i); @@ -293,7 +293,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmu->fixed_ctr_ctrl = data; } -static void reprogram_idx(struct kvm_pmu *pmu, int idx) +static void reprogram_counter(struct kvm_pmu *pmu, int idx) { struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx); @@ -305,7 +305,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx) else { int fidx = idx - INTEL_PMC_IDX_FIXED; reprogram_fixed_counter(pmc, - fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); + fixed_ctrl_field(pmu->fixed_ctr_ctrl, fidx), fidx); } } @@ -317,10 +317,10 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) pmu->global_ctrl = data; for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_idx(pmu, bit); + reprogram_counter(pmu, bit); } -bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr) +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = &vcpu->arch.pmu; int ret; @@ -362,7 +362,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) default: if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || (pmc = get_fixed_pmc(pmu, index))) { - *data = read_pmc(pmc); + *data = pmc_read_counter(pmc); return 0; } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { *data = pmc->eventsel; @@ -415,7 +415,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) (pmc = get_fixed_pmc(pmu, index))) { if (!msr_info->host_initiated) data = (s64)(s32)data; - pmc->counter += data - read_pmc(pmc); + pmc->counter += data - pmc_read_counter(pmc); return 0; } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) @@ -429,7 +429,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } -int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc) +int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned pmc) { struct kvm_pmu *pmu = &vcpu->arch.pmu; bool fixed = pmc & (1u << 30); @@ -438,7 +438,7 @@ int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc) (fixed && pmc >= pmu->nr_arch_fixed_counters); } -int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) +int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) { struct kvm_pmu *pmu = &vcpu->arch.pmu; bool fast_mode = pmc & (1u << 31); @@ -452,7 +452,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) if (fixed && pmc >= pmu->nr_arch_fixed_counters) return 1; counters = fixed ? pmu->fixed_counters : pmu->gp_counters; - ctr = read_pmc(&counters[pmc]); + ctr = pmc_read_counter(&counters[pmc]); if (fast_mode) ctr = (u32)ctr; *data = ctr; @@ -460,7 +460,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) return 0; } -void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) +void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_cpuid_entry2 *entry; @@ -527,8 +527,8 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) pmu->fixed_counters[i].vcpu = vcpu; pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; } - init_irq_work(&pmu->irq_work, trigger_pmi); - kvm_pmu_cpuid_update(vcpu); + init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); + kvm_pmu_refresh(vcpu); } void kvm_pmu_reset(struct kvm_vcpu *vcpu) @@ -539,12 +539,12 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) irq_work_sync(&pmu->irq_work); for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { struct kvm_pmc *pmc = &pmu->gp_counters[i]; - stop_counter(pmc); + pmc_stop_counter(pmc); pmc->counter = pmc->eventsel = 0; } for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) - stop_counter(&pmu->fixed_counters[i]); + pmc_stop_counter(&pmu->fixed_counters[i]); pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = pmu->global_ovf_ctrl = 0; @@ -555,7 +555,7 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); } -void kvm_handle_pmu_event(struct kvm_vcpu *vcpu) +void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; u64 bitmask; @@ -571,6 +571,6 @@ void kvm_handle_pmu_event(struct kvm_vcpu *vcpu) continue; } - reprogram_idx(pmu, bit); + reprogram_counter(pmu, bit); } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6574fa36cb65..c34b52c828ea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -913,7 +913,7 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu) u64 data; int err; - err = kvm_pmu_read_pmc(vcpu, ecx, &data); + err = kvm_pmu_rdpmc(vcpu, ecx, &data); if (err) return err; kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); @@ -2231,7 +2231,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pr = true; case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: - if (kvm_pmu_msr(vcpu, msr)) + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) @@ -2277,7 +2277,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); - if (kvm_pmu_msr(vcpu, msr)) + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", @@ -2435,7 +2435,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: - if (kvm_pmu_msr(vcpu, msr_info->index)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); msr_info->data = 0; break; @@ -2561,7 +2561,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.osvw.status; break; default: - if (kvm_pmu_msr(vcpu, msr_info->index)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); @@ -4966,13 +4966,13 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc); + return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata) { - return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata); + return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata); } static void emulator_halt(struct x86_emulate_ctxt *ctxt) @@ -6542,9 +6542,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) - kvm_handle_pmu_event(vcpu); + kvm_pmu_handle_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) - kvm_deliver_pmi(vcpu); + kvm_pmu_deliver_pmi(vcpu); if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) -- cgit v1.2.1 From 474a5bb944d2ad308a1360dcae72b16b8eecd250 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 19 Jun 2015 13:54:23 +0200 Subject: KVM: x86/vPMU: introduce pmu.h header This will be used for private function used by AMD- and Intel-specific PMU implementations. Signed-off-by: Wei Huang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 12 ------------ arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/pmu.c | 8 ++------ arch/x86/kvm/pmu.h | 26 ++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 1 + 5 files changed, 30 insertions(+), 18 deletions(-) create mode 100644 arch/x86/kvm/pmu.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d92d7edc016b..534dfa324e35 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1195,18 +1195,6 @@ void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); -void kvm_pmu_init(struct kvm_vcpu *vcpu); -void kvm_pmu_destroy(struct kvm_vcpu *vcpu); -void kvm_pmu_reset(struct kvm_vcpu *vcpu); -void kvm_pmu_refresh(struct kvm_vcpu *vcpu); -bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); -int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); -int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned pmc); -int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); -void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); - int __x86_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem); int x86_set_memory_region(struct kvm *kvm, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 9d69f76aa0fa..a64cc76ea92f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -24,6 +24,7 @@ #include "lapic.h" #include "mmu.h" #include "trace.h" +#include "pmu.h" static u32 xstate_required_size(u64 xstate_bv, bool compacted) { diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index d6a4506f62a8..3d2990207be2 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -19,13 +19,9 @@ #include "x86.h" #include "cpuid.h" #include "lapic.h" +#include "pmu.h" -static struct kvm_arch_event_perf_mapping { - u8 eventsel; - u8 unit_mask; - unsigned event_type; - bool inexact; -} arch_events[] = { +static struct kvm_event_hw_type_mapping arch_events[] = { /* Index must match CPUID 0x0A.EBX bit vector */ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h new file mode 100644 index 000000000000..19bf0172f93b --- /dev/null +++ b/arch/x86/kvm/pmu.h @@ -0,0 +1,26 @@ +#ifndef __KVM_X86_PMU_H +#define __KVM_X86_PMU_H + +#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) +#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) +#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) + +struct kvm_event_hw_type_mapping { + u8 eventsel; + u8 unit_mask; + unsigned event_type; +}; + +void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); +void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); +int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); +int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx); +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +void kvm_pmu_refresh(struct kvm_vcpu *vcpu); +void kvm_pmu_reset(struct kvm_vcpu *vcpu); +void kvm_pmu_init(struct kvm_vcpu *vcpu); +void kvm_pmu_destroy(struct kvm_vcpu *vcpu); + +#endif /* __KVM_X86_PMU_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c34b52c828ea..c386f0bd1830 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -28,6 +28,7 @@ #include "x86.h" #include "cpuid.h" #include "assigned-dev.h" +#include "pmu.h" #include #include -- cgit v1.2.1 From 212dba1267a1be228635014fa35c98a59853de9e Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 19 Jun 2015 14:00:33 +0200 Subject: KVM: x86/vPMU: use the new macros to go between PMC, PMU and VCPU Signed-off-by: Wei Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3d2990207be2..38819bc923cf 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -43,14 +43,14 @@ static bool pmc_is_gp(struct kvm_pmc *pmc) static inline u64 pmc_bitmask(struct kvm_pmc *pmc) { - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + struct kvm_pmu *pmu = pmc_to_pmu(pmc); return pmu->counter_bitmask[pmc->type]; } static inline bool pmc_is_enabled(struct kvm_pmc *pmc) { - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + struct kvm_pmu *pmu = pmc_to_pmu(pmc); return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } @@ -91,10 +91,8 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { - struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, - irq_work); - struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, - arch.pmu); + struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); + struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); kvm_pmu_deliver_pmi(vcpu); } @@ -104,7 +102,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event, struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + struct kvm_pmu *pmu = pmc_to_pmu(pmc); if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); @@ -115,7 +113,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, struct perf_sample_data *data, struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + struct kvm_pmu *pmu = pmc_to_pmu(pmc); if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); @@ -128,7 +126,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, * NMI context. Do it from irq work instead. */ if (!kvm_is_in_guest()) - irq_work_queue(&pmc->vcpu->arch.pmu.irq_work); + irq_work_queue(&pmc_to_pmu(pmc)->irq_work); else kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } @@ -190,7 +188,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, } pmc->perf_event = event; - clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi); + clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi); } static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select, @@ -233,7 +231,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) ARCH_PERFMON_EVENTSEL_CMASK | HSW_IN_TX | HSW_IN_TX_CHECKPOINTED))) { - config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, + config = find_arch_event(pmc_to_pmu(pmc), event_select, unit_mask); if (config != PERF_COUNT_HW_MAX) type = PERF_TYPE_HARDWARE; @@ -318,7 +316,7 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int ret; switch (msr) { @@ -339,7 +337,7 @@ bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; switch (index) { @@ -370,7 +368,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; u32 index = msr_info->index; u64 data = msr_info->data; @@ -427,7 +425,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned pmc) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = pmc & (1u << 30); pmc &= ~(3u << 30); return (!fixed && pmc >= pmu->nr_arch_gp_counters) || @@ -436,7 +434,7 @@ int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned pmc) int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fast_mode = pmc & (1u << 31); bool fixed = pmc & (1u << 30); struct kvm_pmc *counters; @@ -458,7 +456,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; @@ -510,7 +508,7 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) void kvm_pmu_init(struct kvm_vcpu *vcpu) { int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { @@ -529,7 +527,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) void kvm_pmu_reset(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; irq_work_sync(&pmu->irq_work); @@ -553,7 +551,7 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); u64 bitmask; int bit; -- cgit v1.2.1 From e84cfe4ce0113a6c5e3bdf70e20a21552ad3a28d Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 19 Jun 2015 14:15:28 +0200 Subject: KVM: x86/vPMU: whitespace and stylistic adjustments in PMU code Signed-off-by: Wei Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 112 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 38819bc923cf..24d213bd42d4 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -103,25 +103,31 @@ static void kvm_perf_overflow(struct perf_event *perf_event, { struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + + if (!test_and_set_bit(pmc->idx, + (unsigned long *)&pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } } static void kvm_perf_overflow_intr(struct perf_event *perf_event, - struct perf_sample_data *data, struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + + if (!test_and_set_bit(pmc->idx, + (unsigned long *)&pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + /* * Inject PMI. If vcpu was in a guest mode during NMI PMI * can be ejected on a guest mode re-entry. Otherwise we can't * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until, + * time of vmexit and is not going to re-enter guest mode until * woken up. So we should wake it, but this is impossible from * NMI context. Do it from irq work instead. */ @@ -157,8 +163,9 @@ static void pmc_stop_counter(struct kvm_pmc *pmc) } static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, - unsigned config, bool exclude_user, bool exclude_kernel, - bool intr, bool in_tx, bool in_tx_cp) + unsigned config, bool exclude_user, + bool exclude_kernel, bool intr, + bool in_tx, bool in_tx_cp) { struct perf_event *event; struct perf_event_attr attr = { @@ -171,6 +178,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + if (in_tx) attr.config |= HSW_IN_TX; if (in_tx_cp) @@ -182,8 +190,8 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, intr ? kvm_perf_overflow_intr : kvm_perf_overflow, pmc); if (IS_ERR(event)) { - printk_once("kvm: pmu event creation failed %ld\n", - PTR_ERR(event)); + printk_once("kvm_pmu: event creation failed %ld\n", + PTR_ERR(event)); return; } @@ -227,10 +235,10 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK | - HSW_IN_TX | - HSW_IN_TX_CHECKPOINTED))) { + ARCH_PERFMON_EVENTSEL_INV | + ARCH_PERFMON_EVENTSEL_CMASK | + HSW_IN_TX | + HSW_IN_TX_CHECKPOINTED))) { config = find_arch_event(pmc_to_pmu(pmc), event_select, unit_mask); if (config != PERF_COUNT_HW_MAX) @@ -241,28 +249,28 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) config = eventsel & X86_RAW_EVENT_MASK; pmc_reprogram_counter(pmc, type, config, - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT, - (eventsel & HSW_IN_TX), - (eventsel & HSW_IN_TX_CHECKPOINTED)); + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT, + (eventsel & HSW_IN_TX), + (eventsel & HSW_IN_TX_CHECKPOINTED)); } -static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) +static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) { - unsigned en = en_pmi & 0x3; - bool pmi = en_pmi & 0x8; + unsigned en_field = ctrl & 0x3; + bool pmi = ctrl & 0x8; pmc_stop_counter(pmc); - if (!en || !pmc_is_enabled(pmc)) + if (!en_field || !pmc_is_enabled(pmc)) return; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - arch_events[fixed_pmc_events[idx]].event_type, - !(en & 0x2), /* exclude user */ - !(en & 0x1), /* exclude kernel */ - pmi, false, false); + arch_events[fixed_pmc_events[idx]].event_type, + !(en_field & 0x2), /* exclude user */ + !(en_field & 0x1), /* exclude kernel */ + pmi, false, false); } static inline u8 fixed_ctrl_field(u64 ctrl, int idx) @@ -275,21 +283,22 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) int i; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - u8 en_pmi = fixed_ctrl_field(data, i); + u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); + u8 new_ctrl = fixed_ctrl_field(data, i); struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i); - if (fixed_ctrl_field(pmu->fixed_ctr_ctrl, i) == en_pmi) + if (old_ctrl == new_ctrl) continue; - reprogram_fixed_counter(pmc, en_pmi, i); + reprogram_fixed_counter(pmc, new_ctrl, i); } pmu->fixed_ctr_ctrl = data; } -static void reprogram_counter(struct kvm_pmu *pmu, int idx) +static void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) { - struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx); + struct kvm_pmc *pmc = global_idx_to_pmc(pmu, pmc_idx); if (!pmc) return; @@ -297,9 +306,10 @@ static void reprogram_counter(struct kvm_pmu *pmu, int idx) if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc, pmc->eventsel); else { - int fidx = idx - INTEL_PMC_IDX_FIXED; - reprogram_fixed_counter(pmc, - fixed_ctrl_field(pmu->fixed_ctr_ctrl, fidx), fidx); + int idx = pmc_idx - INTEL_PMC_IDX_FIXED; + u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); + + reprogram_fixed_counter(pmc, ctrl, idx); } } @@ -423,37 +433,43 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } -int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned pmc) +/* check if idx is a valid index to access PMU */ +int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fixed = pmc & (1u << 30); - pmc &= ~(3u << 30); - return (!fixed && pmc >= pmu->nr_arch_gp_counters) || - (fixed && pmc >= pmu->nr_arch_fixed_counters); + bool fixed = idx & (1u << 30); + idx &= ~(3u << 30); + return (!fixed && idx >= pmu->nr_arch_gp_counters) || + (fixed && idx >= pmu->nr_arch_fixed_counters); } -int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) +int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fast_mode = pmc & (1u << 31); - bool fixed = pmc & (1u << 30); + bool fast_mode = idx & (1u << 31); + bool fixed = idx & (1u << 30); struct kvm_pmc *counters; - u64 ctr; + u64 ctr_val; - pmc &= ~(3u << 30); - if (!fixed && pmc >= pmu->nr_arch_gp_counters) + idx &= ~(3u << 30); + if (!fixed && idx >= pmu->nr_arch_gp_counters) return 1; - if (fixed && pmc >= pmu->nr_arch_fixed_counters) + if (fixed && idx >= pmu->nr_arch_fixed_counters) return 1; counters = fixed ? pmu->fixed_counters : pmu->gp_counters; - ctr = pmc_read_counter(&counters[pmc]); + + ctr_val = pmc_read_counter(&counters[idx]); if (fast_mode) - ctr = (u32)ctr; - *data = ctr; + ctr_val = (u32)ctr_val; + *data = ctr_val; return 0; } +/* refresh PMU settings. This function generally is called when underlying + * settings are changed (such as changes of PMU CPUID by guest VMs), which + * should rarely happen. + */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); -- cgit v1.2.1 From e5af058aacd55e578b3c57b1582b90c4290b77f9 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 19 Jun 2015 15:51:47 +0200 Subject: KVM: x86/vPMU: reorder PMU functions Keep called functions closer to their callers, and init/destroy functions next to each other. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 156 ++++++++++++++++++++++++++--------------------------- 1 file changed, 78 insertions(+), 78 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 24d213bd42d4..f38ad84be87e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -83,12 +83,6 @@ static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED); } -void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.apic) - kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); -} - static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); @@ -324,6 +318,65 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) reprogram_counter(pmu, bit); } +void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 bitmask; + int bit; + + bitmask = pmu->reprogram_pmi; + + for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { + struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit); + + if (unlikely(!pmc || !pmc->perf_event)) { + clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); + continue; + } + + reprogram_counter(pmu, bit); + } +} + +/* check if idx is a valid index to access PMU */ +int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + idx &= ~(3u << 30); + return (!fixed && idx >= pmu->nr_arch_gp_counters) || + (fixed && idx >= pmu->nr_arch_fixed_counters); +} + +int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fast_mode = idx & (1u << 31); + bool fixed = idx & (1u << 30); + struct kvm_pmc *counters; + u64 ctr_val; + + idx &= ~(3u << 30); + if (!fixed && idx >= pmu->nr_arch_gp_counters) + return 1; + if (fixed && idx >= pmu->nr_arch_fixed_counters) + return 1; + counters = fixed ? pmu->fixed_counters : pmu->gp_counters; + + ctr_val = pmc_read_counter(&counters[idx]); + if (fast_mode) + ctr_val = (u32)ctr_val; + + *data = ctr_val; + return 0; +} + +void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.apic) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); +} + bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -433,39 +486,6 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } -/* check if idx is a valid index to access PMU */ -int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fixed = idx & (1u << 30); - idx &= ~(3u << 30); - return (!fixed && idx >= pmu->nr_arch_gp_counters) || - (fixed && idx >= pmu->nr_arch_fixed_counters); -} - -int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fast_mode = idx & (1u << 31); - bool fixed = idx & (1u << 30); - struct kvm_pmc *counters; - u64 ctr_val; - - idx &= ~(3u << 30); - if (!fixed && idx >= pmu->nr_arch_gp_counters) - return 1; - if (fixed && idx >= pmu->nr_arch_fixed_counters) - return 1; - counters = fixed ? pmu->fixed_counters : pmu->gp_counters; - - ctr_val = pmc_read_counter(&counters[idx]); - if (fast_mode) - ctr_val = (u32)ctr_val; - - *data = ctr_val; - return 0; -} - /* refresh PMU settings. This function generally is called when underlying * settings are changed (such as changes of PMU CPUID by guest VMs), which * should rarely happen. @@ -521,26 +541,6 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; } -void kvm_pmu_init(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - memset(pmu, 0, sizeof(*pmu)); - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { - pmu->gp_counters[i].type = KVM_PMC_GP; - pmu->gp_counters[i].vcpu = vcpu; - pmu->gp_counters[i].idx = i; - } - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { - pmu->fixed_counters[i].type = KVM_PMC_FIXED; - pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; - } - init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); - kvm_pmu_refresh(vcpu); -} - void kvm_pmu_reset(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -560,27 +560,27 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) pmu->global_ovf_ctrl = 0; } -void kvm_pmu_destroy(struct kvm_vcpu *vcpu) -{ - kvm_pmu_reset(vcpu); -} - -void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) +void kvm_pmu_init(struct kvm_vcpu *vcpu) { + int i; struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - u64 bitmask; - int bit; - - bitmask = pmu->reprogram_pmi; - - for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit); - if (unlikely(!pmc || !pmc->perf_event)) { - clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); - continue; - } - - reprogram_counter(pmu, bit); + memset(pmu, 0, sizeof(*pmu)); + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; } + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { + pmu->fixed_counters[i].type = KVM_PMC_FIXED; + pmu->fixed_counters[i].vcpu = vcpu; + pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + } + init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); + kvm_pmu_refresh(vcpu); +} + +void kvm_pmu_destroy(struct kvm_vcpu *vcpu) +{ + kvm_pmu_reset(vcpu); } -- cgit v1.2.1 From bafac298fb20e9ae1305c710d4fd8d20c5911afa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Sat, 20 Jun 2015 11:50:50 +0200 Subject: x86/hpet: Check for irq==0 when allocating hpet MSI interrupts irq == 0 is not a valid irq for a irqdomain MSI allocation, but hpet code checks only for negative return values. Reported-by: Sergey Senozhatsky Cc: Borislav Petkov Link: http://lkml.kernel.org/r/558447AF.30703@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index e2449cf38b06..c47aab35a17e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -578,7 +578,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) continue; irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); - if (irq < 0) + if (irq <= 0) continue; sprintf(hdev->name, "hpet%d", i); -- cgit v1.2.1 From cb17b2a674f2059343f997599b4b001e64eec516 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 21 Jun 2015 16:21:50 +0200 Subject: x86/hpet: Use proper hpet device number for MSI allocation hpet_assign_irq() is called with hpet_device->num as "hardware interrupt number", but hpet_device->num is initialized after the interrupt has been assigned, so it's always 0. As a consequence only the first MSI allocation succeeds, the following ones fail because the "hardware interrupt number" already exists. Move the initialization of dev->num and other fields before the call to hpet_assign_irq(), which is the ordering before the offending commit which introduced that regression. Fixes: "3cb96f0c9733 x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains" Reported-by: Sergey Senozhatsky Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1506211635010.4107@nanos Cc: Jiang Liu Cc: Borislav Petkov --- arch/x86/kernel/hpet.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c47aab35a17e..10757d0a3fcf 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -577,16 +577,17 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) if (!(cfg & HPET_TN_FSB_CAP)) continue; + hdev->flags = 0; + if (cfg & HPET_TN_PERIODIC_CAP) + hdev->flags |= HPET_DEV_PERI_CAP; + sprintf(hdev->name, "hpet%d", i); + hdev->num = i; + irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); if (irq <= 0) continue; - sprintf(hdev->name, "hpet%d", i); - hdev->num = i; hdev->irq = irq; - hdev->flags = 0; - if (cfg & HPET_TN_PERIODIC_CAP) - hdev->flags |= HPET_DEV_PERI_CAP; hdev->flags |= HPET_DEV_FSB_CAP; hdev->flags |= HPET_DEV_VALID; num_timers_used++; -- cgit v1.2.1 From ffa64eff956a25548cad0391dbc14c672827be7b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Jun 2015 14:40:03 +0200 Subject: x86: Load __USER_DS into DS/ES after resume Srinivas Pandruvada reported a problem with system resume from suspend-to-RAM on 32-bit x86 systems where the DS register of the CPU is set to __KERNEL_DS instead of __USER_DS on return to user space which cases a General Protection Fault to occur. The issue is that DS is set to __KERNEL_DS by the ACPI resume code path while the SYSEXIT path never reloads DS/ES. It assumes they are still __USER_DS set at the SYSENTER time (Brian Gerst), so if the return to user space happens to be through SYSEXIT, it will lead to the reported GPF. Fix the problem by setting the DS and ES registers to __USER_DS as expected by the SYSEXIT path. Link: https://bugzilla.kernel.org/show_bug.cgi?id=61781 Link: http://marc.info/?l=linux-pm&m=143406648920385&w=2 Acked-by: Pavel Machek Tested-by: Pavel Machek Acked-by: Ingo Molnar Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/wakeup_32.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index 665c6b7d2ea9..0c26b1b44e51 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -12,11 +12,13 @@ ENTRY(wakeup_pmode_return) wakeup_pmode_return: movw $__KERNEL_DS, %ax movw %ax, %ss - movw %ax, %ds - movw %ax, %es movw %ax, %fs movw %ax, %gs + movw $__USER_DS, %ax + movw %ax, %ds + movw %ax, %es + # reload the gdt, as we need the full 32 bit address lidt saved_idt lldt saved_ldt -- cgit v1.2.1 From 41aac14a8dee66a720894e5979c2372c0d5afd34 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 19 Jun 2015 16:16:59 +0200 Subject: KVM: x86/vPMU: introduce kvm_pmu_msr_idx_to_pmc This function will be part of the kvm_pmu_ops interface. Introduce it already. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f38ad84be87e..bd5dbd9ce0e3 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -348,22 +348,34 @@ int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) (fixed && idx >= pmu->nr_arch_fixed_counters); } -int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) +static struct kvm_pmc *kvm_pmu_msr_idx_to_pmc(struct kvm_vcpu *vcpu, + unsigned idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fast_mode = idx & (1u << 31); bool fixed = idx & (1u << 30); struct kvm_pmc *counters; - u64 ctr_val; idx &= ~(3u << 30); if (!fixed && idx >= pmu->nr_arch_gp_counters) - return 1; + return NULL; if (fixed && idx >= pmu->nr_arch_fixed_counters) - return 1; + return NULL; counters = fixed ? pmu->fixed_counters : pmu->gp_counters; - ctr_val = pmc_read_counter(&counters[idx]); + return &counters[idx]; +} + +int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) +{ + bool fast_mode = idx & (1u << 31); + struct kvm_pmc *pmc; + u64 ctr_val; + + pmc = kvm_pmu_msr_idx_to_pmc(vcpu, idx); + if (!pmc) + return 1; + + ctr_val = pmc_read_counter(pmc); if (fast_mode) ctr_val = (u32)ctr_val; -- cgit v1.2.1 From 25462f7f5295e2d3e9c2b31761ac95f0b3c8562f Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 19 Jun 2015 15:45:05 +0200 Subject: KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch This patch defines a new function pointer struct (kvm_pmu_ops) to support vPMU for both Intel and AMD. The functions pointers defined in this new struct will be linked with Intel and AMD functions later. In the meanwhile the struct that maps from event_sel bits to PERF_TYPE_HARDWARE events is renamed and moved from Intel specific code to kvm_host.h as a common struct. Reviewed-by: Joerg Roedel Tested-by: Joerg Roedel Signed-off-by: Wei Huang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 + arch/x86/kvm/Makefile | 4 +- arch/x86/kvm/pmu.c | 383 +++++----------------------------------- arch/x86/kvm/pmu.h | 92 ++++++++++ arch/x86/kvm/pmu_amd.c | 97 ++++++++++ arch/x86/kvm/pmu_intel.c | 358 +++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm.c | 3 + arch/x86/kvm/vmx.c | 3 + 8 files changed, 606 insertions(+), 338 deletions(-) create mode 100644 arch/x86/kvm/pmu_amd.c create mode 100644 arch/x86/kvm/pmu_intel.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 534dfa324e35..5a2b4508be44 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -336,6 +336,8 @@ struct kvm_pmu { u64 reprogram_pmi; }; +struct kvm_pmu_ops; + enum { KVM_DEBUGREG_BP_ENABLED = 1, KVM_DEBUGREG_WONT_EXIT = 2, @@ -854,6 +856,8 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); + /* pmu operations of sub-arch */ + const struct kvm_pmu_ops *pmu_ops; }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 470dc6c9d409..67d215cb8953 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -14,8 +14,8 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o -kvm-intel-y += vmx.o -kvm-amd-y += svm.o +kvm-intel-y += vmx.o pmu_intel.o +kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index bd5dbd9ce0e3..31aa2c85dc97 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -1,11 +1,12 @@ /* * Kernel-based Virtual Machine -- Performance Monitoring Unit support * - * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * Copyright 2015 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity * Gleb Natapov + * Wei Huang * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -21,67 +22,30 @@ #include "lapic.h" #include "pmu.h" -static struct kvm_event_hw_type_mapping arch_events[] = { - /* Index must match CPUID 0x0A.EBX bit vector */ - [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, - [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, - [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, - [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, -}; - -/* mapping between fixed pmc index and arch_events array */ -static int fixed_pmc_events[] = {1, 0, 7}; - -static bool pmc_is_gp(struct kvm_pmc *pmc) -{ - return pmc->type == KVM_PMC_GP; -} - -static inline u64 pmc_bitmask(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - return pmu->counter_bitmask[pmc->type]; -} - -static inline bool pmc_is_enabled(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - -static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, - u32 base) -{ - if (msr >= base && msr < base + pmu->nr_arch_gp_counters) - return &pmu->gp_counters[msr - base]; - return NULL; -} - -static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) -{ - int base = MSR_CORE_PERF_FIXED_CTR0; - if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) - return &pmu->fixed_counters[msr - base]; - return NULL; -} - -static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) -{ - return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx); -} - -static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) -{ - if (idx < INTEL_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); - else - return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED); -} +/* NOTE: + * - Each perf counter is defined as "struct kvm_pmc"; + * - There are two types of perf counters: general purpose (gp) and fixed. + * gp counters are stored in gp_counters[] and fixed counters are stored + * in fixed_counters[] respectively. Both of them are part of "struct + * kvm_pmu"; + * - pmu.c understands the difference between gp counters and fixed counters. + * However AMD doesn't support fixed-counters; + * - There are three types of index to access perf counters (PMC): + * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD + * has MSR_K7_PERFCTRn. + * 2. MSR Index (named idx): This normally is used by RDPMC instruction. + * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access + * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except + * that it also supports fixed counters. idx can be used to as index to + * gp and fixed counters. + * 3. Global PMC Index (named pmc): pmc is an index specific to PMU + * code. Each pmc, stored in kvm_pmc.idx field, is unique across + * all perf counters (both gp and fixed). The mapping relationship + * between pmc and perf counters is as the following: + * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters + * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed + * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters + */ static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { @@ -132,30 +96,6 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, } } -static u64 pmc_read_counter(struct kvm_pmc *pmc) -{ - u64 counter, enabled, running; - - counter = pmc->counter; - - if (pmc->perf_event) - counter += perf_event_read_value(pmc->perf_event, - &enabled, &running); - - /* FIXME: Scaling needed? */ - - return counter & pmc_bitmask(pmc); -} - -static void pmc_stop_counter(struct kvm_pmc *pmc) -{ - if (pmc->perf_event) { - pmc->counter = pmc_read_counter(pmc); - perf_event_release_kernel(pmc->perf_event); - pmc->perf_event = NULL; - } -} - static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, unsigned config, bool exclude_user, bool exclude_kernel, bool intr, @@ -193,24 +133,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi); } -static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select, - u8 unit_mask) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(arch_events); i++) - if (arch_events[i].eventsel == event_select - && arch_events[i].unit_mask == unit_mask - && (pmu->available_event_types & (1 << i))) - break; - - if (i == ARRAY_SIZE(arch_events)) - return PERF_COUNT_HW_MAX; - - return arch_events[i].event_type; -} - -static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; @@ -233,8 +156,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) ARCH_PERFMON_EVENTSEL_CMASK | HSW_IN_TX | HSW_IN_TX_CHECKPOINTED))) { - config = find_arch_event(pmc_to_pmu(pmc), event_select, - unit_mask); + config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc), + event_select, + unit_mask); if (config != PERF_COUNT_HW_MAX) type = PERF_TYPE_HARDWARE; } @@ -249,8 +173,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) (eventsel & HSW_IN_TX), (eventsel & HSW_IN_TX_CHECKPOINTED)); } +EXPORT_SYMBOL_GPL(reprogram_gp_counter); -static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) +void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) { unsigned en_field = ctrl & 0x3; bool pmi = ctrl & 0x8; @@ -261,38 +186,16 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) return; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - arch_events[fixed_pmc_events[idx]].event_type, + kvm_x86_ops->pmu_ops->find_fixed_event(idx), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ pmi, false, false); } +EXPORT_SYMBOL_GPL(reprogram_fixed_counter); -static inline u8 fixed_ctrl_field(u64 ctrl, int idx) -{ - return (ctrl >> (idx * 4)) & 0xf; -} - -static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) +void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) { - int i; - - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); - u8 new_ctrl = fixed_ctrl_field(data, i); - struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i); - - if (old_ctrl == new_ctrl) - continue; - - reprogram_fixed_counter(pmc, new_ctrl, i); - } - - pmu->fixed_ctr_ctrl = data; -} - -static void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) -{ - struct kvm_pmc *pmc = global_idx_to_pmc(pmu, pmc_idx); + struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx); if (!pmc) return; @@ -306,17 +209,7 @@ static void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) reprogram_fixed_counter(pmc, ctrl, idx); } } - -static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) -{ - int bit; - u64 diff = pmu->global_ctrl ^ data; - - pmu->global_ctrl = data; - - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_counter(pmu, bit); -} +EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { @@ -327,7 +220,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) bitmask = pmu->reprogram_pmi; for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit); + struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit); if (unlikely(!pmc || !pmc->perf_event)) { clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); @@ -341,28 +234,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) /* check if idx is a valid index to access PMU */ int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fixed = idx & (1u << 30); - idx &= ~(3u << 30); - return (!fixed && idx >= pmu->nr_arch_gp_counters) || - (fixed && idx >= pmu->nr_arch_fixed_counters); -} - -static struct kvm_pmc *kvm_pmu_msr_idx_to_pmc(struct kvm_vcpu *vcpu, - unsigned idx) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fixed = idx & (1u << 30); - struct kvm_pmc *counters; - - idx &= ~(3u << 30); - if (!fixed && idx >= pmu->nr_arch_gp_counters) - return NULL; - if (fixed && idx >= pmu->nr_arch_fixed_counters) - return NULL; - counters = fixed ? pmu->fixed_counters : pmu->gp_counters; - - return &counters[idx]; + return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx); } int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) @@ -371,7 +243,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) struct kvm_pmc *pmc; u64 ctr_val; - pmc = kvm_pmu_msr_idx_to_pmc(vcpu, idx); + pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx); if (!pmc) return 1; @@ -391,111 +263,17 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - int ret; - - switch (msr) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - case MSR_CORE_PERF_GLOBAL_STATUS: - case MSR_CORE_PERF_GLOBAL_CTRL: - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - ret = pmu->version > 1; - break; - default: - ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) - || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) - || get_fixed_pmc(pmu, msr); - break; - } - return ret; + return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr); } -int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc; - - switch (index) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - *data = pmu->fixed_ctr_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_STATUS: - *data = pmu->global_status; - return 0; - case MSR_CORE_PERF_GLOBAL_CTRL: - *data = pmu->global_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - *data = pmu->global_ovf_ctrl; - return 0; - default: - if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, index))) { - *data = pmc_read_counter(pmc); - return 0; - } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { - *data = pmc->eventsel; - return 0; - } - } - return 1; + return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data); } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc; - u32 index = msr_info->index; - u64 data = msr_info->data; - - switch (index) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (pmu->fixed_ctr_ctrl == data) - return 0; - if (!(data & 0xfffffffffffff444ull)) { - reprogram_fixed_counters(pmu, data); - return 0; - } - break; - case MSR_CORE_PERF_GLOBAL_STATUS: - if (msr_info->host_initiated) { - pmu->global_status = data; - return 0; - } - break; /* RO MSR */ - case MSR_CORE_PERF_GLOBAL_CTRL: - if (pmu->global_ctrl == data) - return 0; - if (!(data & pmu->global_ctrl_mask)) { - global_ctrl_changed(pmu, data); - return 0; - } - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - pmu->global_ovf_ctrl = data; - return 0; - } - break; - default: - if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, index))) { - if (!msr_info->host_initiated) - data = (s64)(s32)data; - pmc->counter += data - pmc_read_counter(pmc); - return 0; - } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { - if (data == pmc->eventsel) - return 0; - if (!(data & pmu->reserved_bits)) { - reprogram_gp_counter(pmc, data); - return 0; - } - } - } - return 1; + return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info); } /* refresh PMU settings. This function generally is called when underlying @@ -504,90 +282,23 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_cpuid_entry2 *entry; - union cpuid10_eax eax; - union cpuid10_edx edx; - - pmu->nr_arch_gp_counters = 0; - pmu->nr_arch_fixed_counters = 0; - pmu->counter_bitmask[KVM_PMC_GP] = 0; - pmu->counter_bitmask[KVM_PMC_FIXED] = 0; - pmu->version = 0; - pmu->reserved_bits = 0xffffffff00200000ull; - - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) - return; - eax.full = entry->eax; - edx.full = entry->edx; - - pmu->version = eax.split.version_id; - if (!pmu->version) - return; - - pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - INTEL_PMC_MAX_GENERIC); - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; - pmu->available_event_types = ~entry->ebx & - ((1ull << eax.split.mask_length) - 1); - - if (pmu->version == 1) { - pmu->nr_arch_fixed_counters = 0; - } else { - pmu->nr_arch_fixed_counters = - min_t(int, edx.split.num_counters_fixed, - INTEL_PMC_MAX_FIXED); - pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << edx.split.bit_width_fixed) - 1; - } - - pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; - - entry = kvm_find_cpuid_entry(vcpu, 7, 0); - if (entry && - (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && - (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) - pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; + kvm_x86_ops->pmu_ops->refresh(vcpu); } void kvm_pmu_reset(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - int i; irq_work_sync(&pmu->irq_work); - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { - struct kvm_pmc *pmc = &pmu->gp_counters[i]; - pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; - } - - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) - pmc_stop_counter(&pmu->fixed_counters[i]); - - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = - pmu->global_ovf_ctrl = 0; + kvm_x86_ops->pmu_ops->reset(vcpu); } void kvm_pmu_init(struct kvm_vcpu *vcpu) { - int i; struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { - pmu->gp_counters[i].type = KVM_PMC_GP; - pmu->gp_counters[i].vcpu = vcpu; - pmu->gp_counters[i].idx = i; - } - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { - pmu->fixed_counters[i].type = KVM_PMC_FIXED; - pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; - } + kvm_x86_ops->pmu_ops->init(vcpu); init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); kvm_pmu_refresh(vcpu); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 19bf0172f93b..f96e1f962587 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -5,12 +5,102 @@ #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) +/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ +#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) + struct kvm_event_hw_type_mapping { u8 eventsel; u8 unit_mask; unsigned event_type; }; +struct kvm_pmu_ops { + unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select, + u8 unit_mask); + unsigned (*find_fixed_event)(int idx); + bool (*pmc_is_enabled)(struct kvm_pmc *pmc); + struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); + struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx); + int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx); + bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); + void (*refresh)(struct kvm_vcpu *vcpu); + void (*init)(struct kvm_vcpu *vcpu); + void (*reset)(struct kvm_vcpu *vcpu); +}; + +static inline u64 pmc_bitmask(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + return pmu->counter_bitmask[pmc->type]; +} + +static inline u64 pmc_read_counter(struct kvm_pmc *pmc) +{ + u64 counter, enabled, running; + + counter = pmc->counter; + if (pmc->perf_event) + counter += perf_event_read_value(pmc->perf_event, + &enabled, &running); + /* FIXME: Scaling needed? */ + return counter & pmc_bitmask(pmc); +} + +static inline void pmc_stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = pmc_read_counter(pmc); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +static inline bool pmc_is_gp(struct kvm_pmc *pmc) +{ + return pmc->type == KVM_PMC_GP; +} + +static inline bool pmc_is_fixed(struct kvm_pmc *pmc) +{ + return pmc->type == KVM_PMC_FIXED; +} + +static inline bool pmc_is_enabled(struct kvm_pmc *pmc) +{ + return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc); +} + +/* returns general purpose PMC with the specified MSR. Note that it can be + * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a + * paramenter to tell them apart. + */ +static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, + u32 base) +{ + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) + return &pmu->gp_counters[msr - base]; + + return NULL; +} + +/* returns fixed PMC with the specified MSR */ +static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) +{ + int base = MSR_CORE_PERF_FIXED_CTR0; + + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) + return &pmu->fixed_counters[msr - base]; + + return NULL; +} + +void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); +void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); +void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); @@ -23,4 +113,6 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); +extern struct kvm_pmu_ops intel_pmu_ops; +extern struct kvm_pmu_ops amd_pmu_ops; #endif /* __KVM_X86_PMU_H */ diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c new file mode 100644 index 000000000000..48786407fee1 --- /dev/null +++ b/arch/x86/kvm/pmu_amd.c @@ -0,0 +1,97 @@ +/* + * KVM PMU support for AMD + * + * Copyright 2015, Red Hat, Inc. and/or its affiliates. + * + * Author: + * Wei Huang + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Implementation is based on pmu_intel.c file + */ +#include +#include +#include +#include "x86.h" +#include "cpuid.h" +#include "lapic.h" +#include "pmu.h" + +static unsigned amd_find_arch_event(struct kvm_pmu *pmu, + u8 event_select, + u8 unit_mask) +{ + return PERF_COUNT_HW_MAX; +} + +/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ +static unsigned amd_find_fixed_event(int idx) +{ + return PERF_COUNT_HW_MAX; +} + +static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) +{ + return false; +} + +static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + return NULL; +} + +/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ +static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +{ + return 1; +} + +/* idx is the ECX register of RDPMC instruction */ +static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx) +{ + return NULL; +} + +static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + return false; +} + +static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + return 1; +} + +static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + return 1; +} + +static void amd_pmu_refresh(struct kvm_vcpu *vcpu) +{ +} + +static void amd_pmu_init(struct kvm_vcpu *vcpu) +{ +} + +static void amd_pmu_reset(struct kvm_vcpu *vcpu) +{ +} + +struct kvm_pmu_ops amd_pmu_ops = { + .find_arch_event = amd_find_arch_event, + .find_fixed_event = amd_find_fixed_event, + .pmc_is_enabled = amd_pmc_is_enabled, + .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, + .msr_idx_to_pmc = amd_msr_idx_to_pmc, + .is_valid_msr_idx = amd_is_valid_msr_idx, + .is_valid_msr = amd_is_valid_msr, + .get_msr = amd_pmu_get_msr, + .set_msr = amd_pmu_set_msr, + .refresh = amd_pmu_refresh, + .init = amd_pmu_init, + .reset = amd_pmu_reset, +}; diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c new file mode 100644 index 000000000000..ab38af4f4947 --- /dev/null +++ b/arch/x86/kvm/pmu_intel.c @@ -0,0 +1,358 @@ +/* + * KVM PMU support for Intel CPUs + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity + * Gleb Natapov + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ +#include +#include +#include +#include +#include "x86.h" +#include "cpuid.h" +#include "lapic.h" +#include "pmu.h" + +static struct kvm_event_hw_type_mapping intel_arch_events[] = { + /* Index must match CPUID 0x0A.EBX bit vector */ + [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, + [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, + [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, + [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, +}; + +/* mapping between fixed pmc index and intel_arch_events array */ +static int fixed_pmc_events[] = {1, 0, 7}; + +static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) +{ + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + u8 new_ctrl = fixed_ctrl_field(data, i); + u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); + struct kvm_pmc *pmc; + + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + + if (old_ctrl == new_ctrl) + continue; + + reprogram_fixed_counter(pmc, new_ctrl, i); + } + + pmu->fixed_ctr_ctrl = data; +} + +/* function is called when global control register has been updated. */ +static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +{ + int bit; + u64 diff = pmu->global_ctrl ^ data; + + pmu->global_ctrl = data; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + reprogram_counter(pmu, bit); +} + +static unsigned intel_find_arch_event(struct kvm_pmu *pmu, + u8 event_select, + u8 unit_mask) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) + if (intel_arch_events[i].eventsel == event_select + && intel_arch_events[i].unit_mask == unit_mask + && (pmu->available_event_types & (1 << i))) + break; + + if (i == ARRAY_SIZE(intel_arch_events)) + return PERF_COUNT_HW_MAX; + + return intel_arch_events[i].event_type; +} + +static unsigned intel_find_fixed_event(int idx) +{ + if (idx >= ARRAY_SIZE(fixed_pmc_events)) + return PERF_COUNT_HW_MAX; + + return intel_arch_events[fixed_pmc_events[idx]].event_type; +} + +/* check if a PMC is enabled by comparising it with globl_ctrl bits. */ +static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + if (pmc_idx < INTEL_PMC_IDX_FIXED) + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } +} + +/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ +static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + + idx &= ~(3u << 30); + + return (!fixed && idx >= pmu->nr_arch_gp_counters) || + (fixed && idx >= pmu->nr_arch_fixed_counters); +} + +static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, + unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + struct kvm_pmc *counters; + + idx &= ~(3u << 30); + if (!fixed && idx >= pmu->nr_arch_gp_counters) + return NULL; + if (fixed && idx >= pmu->nr_arch_fixed_counters) + return NULL; + counters = fixed ? pmu->fixed_counters : pmu->gp_counters; + + return &counters[idx]; +} + +static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int ret; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + ret = pmu->version > 1; + break; + default: + ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || + get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || + get_fixed_pmc(pmu, msr); + break; + } + + return ret; +} + +static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + *data = pmu->fixed_ctr_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_STATUS: + *data = pmu->global_status; + return 0; + case MSR_CORE_PERF_GLOBAL_CTRL: + *data = pmu->global_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *data = pmu->global_ovf_ctrl; + return 0; + default: + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, msr))) { + *data = pmc_read_counter(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { + *data = pmc->eventsel; + return 0; + } + } + + return 1; +} + +static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 msr = msr_info->index; + u64 data = msr_info->data; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + if (pmu->fixed_ctr_ctrl == data) + return 0; + if (!(data & 0xfffffffffffff444ull)) { + reprogram_fixed_counters(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + if (msr_info->host_initiated) { + pmu->global_status = data; + return 0; + } + break; /* RO MSR */ + case MSR_CORE_PERF_GLOBAL_CTRL: + if (pmu->global_ctrl == data) + return 0; + if (!(data & pmu->global_ctrl_mask)) { + global_ctrl_changed(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + pmu->global_ovf_ctrl = data; + return 0; + } + break; + default: + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, msr))) { + if (!msr_info->host_initiated) + data = (s64)(s32)data; + pmc->counter += data - pmc_read_counter(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { + if (data == pmc->eventsel) + return 0; + if (!(data & pmu->reserved_bits)) { + reprogram_gp_counter(pmc, data); + return 0; + } + } + } + + return 1; +} + +static void intel_pmu_refresh(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_cpuid_entry2 *entry; + union cpuid10_eax eax; + union cpuid10_edx edx; + + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry) + return; + eax.full = entry->eax; + edx.full = entry->edx; + + pmu->version = eax.split.version_id; + if (!pmu->version) + return; + + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, + INTEL_PMC_MAX_GENERIC); + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + pmu->available_event_types = ~entry->ebx & + ((1ull << eax.split.mask_length) - 1); + + if (pmu->version == 1) { + pmu->nr_arch_fixed_counters = 0; + } else { + pmu->nr_arch_fixed_counters = + min_t(int, edx.split.num_counters_fixed, + INTEL_PMC_MAX_FIXED); + pmu->counter_bitmask[KVM_PMC_FIXED] = + ((u64)1 << edx.split.bit_width_fixed) - 1; + } + + pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); + pmu->global_ctrl_mask = ~pmu->global_ctrl; + + entry = kvm_find_cpuid_entry(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) + pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; +} + +static void intel_pmu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; + } + + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { + pmu->fixed_counters[i].type = KVM_PMC_FIXED; + pmu->fixed_counters[i].vcpu = vcpu; + pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + } +} + +static void intel_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + struct kvm_pmc *pmc = &pmu->gp_counters[i]; + + pmc_stop_counter(pmc); + pmc->counter = pmc->eventsel = 0; + } + + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) + pmc_stop_counter(&pmu->fixed_counters[i]); + + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = + pmu->global_ovf_ctrl = 0; +} + +struct kvm_pmu_ops intel_pmu_ops = { + .find_arch_event = intel_find_arch_event, + .find_fixed_event = intel_find_fixed_event, + .pmc_is_enabled = intel_pmc_is_enabled, + .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, + .msr_idx_to_pmc = intel_msr_idx_to_pmc, + .is_valid_msr_idx = intel_is_valid_msr_idx, + .is_valid_msr = intel_is_valid_msr, + .get_msr = intel_pmu_get_msr, + .set_msr = intel_pmu_set_msr, + .refresh = intel_pmu_refresh, + .init = intel_pmu_init, + .reset = intel_pmu_reset, +}; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e7685af399e4..851a9a1c6dfc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -21,6 +21,7 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" +#include "pmu.h" #include #include @@ -4457,6 +4458,8 @@ static struct kvm_x86_ops svm_x86_ops = { .handle_external_intr = svm_handle_external_intr, .sched_in = svm_sched_in, + + .pmu_ops = &amd_pmu_ops, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 44eafdb440c9..e5a379f82672 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -48,6 +48,7 @@ #include #include "trace.h" +#include "pmu.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) #define __ex_clear(x, reg) \ @@ -10419,6 +10420,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + + .pmu_ops = &intel_pmu_ops, }; static int __init vmx_init(void) -- cgit v1.2.1 From ca724305a2b02abcbecbf22577536fc8e965ab4f Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 12 Jun 2015 01:34:55 -0400 Subject: KVM: x86/vPMU: Implement AMD vPMU code for KVM This patch replaces the empty AMD vPMU functions (in pmu_amd.c) with real implementation. Reviewed-by: Joerg Roedel Tested-by: Joerg Roedel Signed-off-by: Wei Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu_amd.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 116 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index 48786407fee1..886aa25a7131 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -19,11 +19,33 @@ #include "lapic.h" #include "pmu.h" +/* duplicated from amd_perfmon_event_map, K7 and above should work. */ +static struct kvm_event_hw_type_mapping amd_event_mapping[] = { + [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES }, + [3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES }, + [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, +}; + static unsigned amd_find_arch_event(struct kvm_pmu *pmu, u8 event_select, u8 unit_mask) { - return PERF_COUNT_HW_MAX; + int i; + + for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) + if (amd_event_mapping[i].eventsel == event_select + && amd_event_mapping[i].unit_mask == unit_mask) + break; + + if (i == ARRAY_SIZE(amd_event_mapping)) + return PERF_COUNT_HW_MAX; + + return amd_event_mapping[i].event_type; } /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ @@ -32,53 +54,141 @@ static unsigned amd_find_fixed_event(int idx) return PERF_COUNT_HW_MAX; } +/* check if a PMC is enabled by comparing it against global_ctrl bits. Because + * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). + */ static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) { - return false; + return true; } static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) { - return NULL; + return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + pmc_idx, MSR_K7_EVNTSEL0); } /* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) { - return 1; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + idx &= ~(3u << 30); + + return (idx >= pmu->nr_arch_gp_counters); } /* idx is the ECX register of RDPMC instruction */ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx) { - return NULL; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *counters; + + idx &= ~(3u << 30); + if (idx >= pmu->nr_arch_gp_counters) + return NULL; + counters = pmu->gp_counters; + + return &counters[idx]; } static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - return false; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int ret = false; + + ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) || + get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + + return ret; } static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + /* MSR_K7_PERFCTRn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); + if (pmc) { + *data = pmc_read_counter(pmc); + return 0; + } + /* MSR_K7_EVNTSELn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + if (pmc) { + *data = pmc->eventsel; + return 0; + } + return 1; } static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 msr = msr_info->index; + u64 data = msr_info->data; + + /* MSR_K7_PERFCTRn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); + if (pmc) { + if (!msr_info->host_initiated) + data = (s64)data; + pmc->counter += data - pmc_read_counter(pmc); + return 0; + } + /* MSR_K7_EVNTSELn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + if (pmc) { + if (data == pmc->eventsel) + return 0; + if (!(data & pmu->reserved_bits)) { + reprogram_gp_counter(pmc, data); + return 0; + } + } + return 1; } static void amd_pmu_refresh(struct kvm_vcpu *vcpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; + pmu->reserved_bits = 0xffffffff00200000ull; + /* not applicable to AMD; but clean them to prevent any fall out */ + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->version = 0; + pmu->global_status = 0; } static void amd_pmu_init(struct kvm_vcpu *vcpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + for (i = 0; i < AMD64_NUM_COUNTERS ; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; + } } static void amd_pmu_reset(struct kvm_vcpu *vcpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + for (i = 0; i < AMD64_NUM_COUNTERS; i++) { + struct kvm_pmc *pmc = &pmu->gp_counters[i]; + + pmc_stop_counter(pmc); + pmc->counter = pmc->eventsel = 0; + } } struct kvm_pmu_ops amd_pmu_ops = { -- cgit v1.2.1 From 6912ac326d3aab9c0774ebec99f60a73fd04a520 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 12 Jun 2015 01:34:56 -0400 Subject: KVM: x86/vPMU: Enable PMU handling for AMD PERFCTRn and EVNTSELn MSRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch enables AMD guest VM to access (R/W) PMU related MSRs, which include PERFCTR[0..3] and EVNTSEL[0..3]. Reviewed-by: Joerg Roedel Tested-by: Joerg Roedel Reviewed-by: Radim Krčmář Signed-off-by: Wei Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 51 +++++++++------------------------------------------ 1 file changed, 9 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c386f0bd1830..613e13a61cb5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2202,36 +2202,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr, data); - /* Performance counters are not protected by a CPUID bit, - * so we should check all of them in the generic path for the sake of - * cross vendor migration. - * Writing a zero into the event select MSRs disables them, - * which we perfectly emulate ;-). Any other value should be at least - * reported, some guests depend on them. - */ - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - if (data != 0) - vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); - break; - /* at least RHEL 4 unconditionally writes to the perfctr registers, - * so we ignore writes to make it happy. - */ - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: - vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); - break; - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - pr = true; - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: + case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: + pr = true; /* fall through */ + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: + case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -2418,24 +2393,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K8_SYSCFG: case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: case MSR_AMD64_BU_CFG2: msr_info->data = 0; break; - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: + case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); msr_info->data = 0; -- cgit v1.2.1 From 2ae416b142b625c58c9ccb039aa3ef48ad0e9bae Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Wed, 24 Jun 2015 16:56:16 -0700 Subject: mm: new mm hook framework CRIU is recreating the process memory layout by remapping the checkpointee memory area on top of the current process (criu). This includes remapping the vDSO to the place it has at checkpoint time. However some architectures like powerpc are keeping a reference to the vDSO base address to build the signal return stack frame by calling the vDSO sigreturn service. So once the vDSO has been moved, this reference is no more valid and the signal frame built later are not usable. This patch serie is introducing a new mm hook framework, and a new arch_remap hook which is called when mremap is done and the mm lock still hold. The next patch is adding the vDSO remap and unmap tracking to the powerpc architecture. This patch (of 3): This patch introduces a new set of header file to manage mm hooks: - per architecture empty header file (arch/x/include/asm/mm-arch-hooks.h) - a generic header (include/linux/mm-arch-hooks.h) The architecture which need to overwrite a hook as to redefine it in its header file, while architecture which doesn't need have nothing to do. The default hooks are defined in the generic header and are used in the case the architecture is not defining it. In a next step, mm hooks defined in include/asm-generic/mm_hooks.h should be moved here. Signed-off-by: Laurent Dufour Suggested-by: Andrew Morton Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/mm-arch-hooks.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 arch/x86/include/asm/mm-arch-hooks.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..4e881a342236 --- /dev/null +++ b/arch/x86/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_X86_MM_ARCH_HOOKS_H +#define _ASM_X86_MM_ARCH_HOOKS_H + +#endif /* _ASM_X86_MM_ARCH_HOOKS_H */ -- cgit v1.2.1 From a67a31fa308a9032ead31b0501dafdb44ccf5a12 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Wed, 24 Jun 2015 16:56:25 -0700 Subject: mm/hugetlb: reduce arch dependent code about hugetlb_prefault_arch_hook Currently we have many duplicates in definitions of hugetlb_prefault_arch_hook. In all architectures this function is empty. Signed-off-by: Zhang Zhen Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/hugetlb.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 68c05398bba9..dab7a3a750bf 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, -- cgit v1.2.1 From 8809aa2d28d74111ff2f1928edaa4e9845c97a7d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 24 Jun 2015 16:57:44 -0700 Subject: mm: clarify that the function operates on hugepage pte We have confusing functions to clear pmd, pmd_clear_* and pmd_clear. Add _huge_ to pmdp_clear functions so that we are clear that they operate on hugepage pte. We don't bother about other functions like pmdp_set_wrprotect, pmdp_clear_flush_young, because they operate on PTE bits and hence indicate they are operating on hugepage ptes Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Andrea Arcangeli Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2562e303405b..867da5bbb4a3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -805,8 +805,8 @@ static inline int pmd_write(pmd_t pmd) return pmd_flags(pmd) & _PAGE_RW; } -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = native_pmdp_get_and_clear(pmdp); -- cgit v1.2.1 From fc6daaf93151877748f8096af6b3fddb147f22d6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:09 -0700 Subject: mm/memblock: add extra "flags" to memblock to allow selection of memory based on attribute Some high end Intel Xeon systems report uncorrectable memory errors as a recoverable machine check. Linux has included code for some time to process these and just signal the affected processes (or even recover completely if the error was in a read only page that can be replaced by reading from disk). But we have no recovery path for errors encountered during kernel code execution. Except for some very specific cases were are unlikely to ever be able to recover. Enter memory mirroring. Actually 3rd generation of memory mirroing. Gen1: All memory is mirrored Pro: No s/w enabling - h/w just gets good data from other side of the mirror Con: Halves effective memory capacity available to OS/applications Gen2: Partial memory mirror - just mirror memory begind some memory controllers Pro: Keep more of the capacity Con: Nightmare to enable. Have to choose between allocating from mirrored memory for safety vs. NUMA local memory for performance Gen3: Address range partial memory mirror - some mirror on each memory controller Pro: Can tune the amount of mirror and keep NUMA performance Con: I have to write memory management code to implement The current plan is just to use mirrored memory for kernel allocations. This has been broken into two phases: 1) This patch series - find the mirrored memory, use it for boot time allocations 2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the unused mirrored memory from mm/memblock.c and only give it out to select kernel allocations (this is still being scoped because page_alloc.c is scary). This patch (of 3): Add extra "flags" to memblock to allow selection of memory based on attribute. No functional changes Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/check.c | 3 ++- arch/x86/kernel/e820.c | 3 ++- arch/x86/mm/init_32.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 83a7995625a6..58118e207a69 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -91,7 +91,8 @@ void __init setup_bios_corruption_check(void) corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) { + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) { start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), PAGE_SIZE, corruption_check_size); end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e2ce85db2283..c8dda42cb6a3 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1123,7 +1123,8 @@ void __init memblock_find_dma_reserve(void) nr_pages += end_pfn - start_pfn; } - for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) { + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) { start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); if (start_pfn < end_pfn) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c8140e12816a..8340e45c891a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -433,7 +433,7 @@ void __init add_highpages_with_active_regions(int nid, phys_addr_t start, end; u64 i; - for_each_free_mem_range(i, nid, &start, &end, NULL) { + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) { unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), start_pfn, end_pfn); unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), -- cgit v1.2.1 From b05b9f5f9dcf593a0e9327676b78e6c17b4218e8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:15 -0700 Subject: x86, mirror: x86 enabling - find mirrored memory ranges UEFI GetMemoryMap() uses a new attribute bit to mark mirrored memory address ranges. See UEFI 2.5 spec pages 157-158: http://www.uefi.org/sites/default/files/resources/UEFI%202_5.pdf On EFI enabled systems scan the memory map and tell memblock about any mirrored ranges. Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 3 +++ arch/x86/platform/efi/efi.c | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 39ca113676fe..d3b95b89e9b2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1105,6 +1105,9 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); + if (efi_enabled(EFI_BOOT)) + efi_find_mirror(); + /* * The EFI specification says that boot service code won't be called * after ExitBootServices(). This is, in fact, a lie. diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 3b984c3aa1b0..c1c382c58c60 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -117,6 +117,27 @@ void efi_get_time(struct timespec *now) now->tv_nsec = 0; } +void __init efi_find_mirror(void) +{ + void *p; + u64 mirror_size = 0, total_size = 0; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + + total_size += size; + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { + memblock_mark_mirror(start, size); + mirror_size += size; + } + } + if (mirror_size) + pr_info("Memory: %lldM/%lldM mirrored memory\n", + mirror_size>>20, total_size>>20); +} + /* * Tell the kernel about the EFI memory map. This might include * more than the max 128 entries that can fit in the e820 legacy -- cgit v1.2.1 From 9f53f9fa4ad1d8bddd4d14359cdabc531aedffe8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 9 Jun 2015 15:33:45 -0400 Subject: libnvdimm, pmem: add libnvdimm support to the pmem driver nd_pmem attaches to persistent memory regions and namespaces emitted by the libnvdimm subsystem, and, same as the original pmem driver, presents the system-physical-address range as a block device. The existing e820-type-12 to pmem setup is converted to an nvdimm_bus that emits an nd_namespace_io device. Note that the X in 'pmemX' is now derived from the parent region. This provides some stability to the pmem devices names from boot-to-boot. The minor numbers are also more predictable by passing 0 to alloc_disk(). Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H. Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Signed-off-by: Ross Zwisler Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- arch/x86/Kconfig | 3 ++ arch/x86/kernel/pmem.c | 92 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 63 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 226d5696e1d1..1a2cbf641667 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1424,6 +1424,9 @@ source "mm/Kconfig" config X86_PMEM_LEGACY bool "Support non-standard NVDIMMs and ADR protected memory" + depends on PHYS_ADDR_T_64BIT + depends on BLK_DEV + select LIBNVDIMM help Treat memory marked using the non-standard e820 type of 12 as used by the Intel Sandy Bridge-EP reference BIOS as protected memory. diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 3420c874ddc5..0f4ef472ab9e 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -1,53 +1,81 @@ /* * Copyright (c) 2015, Christoph Hellwig. + * Copyright (c) 2015, Intel Corporation. */ -#include #include -#include +#include +#include #include -#include -#include -static __init void register_pmem_device(struct resource *res) +static void e820_pmem_release(struct device *dev) { - struct platform_device *pdev; - int error; + struct nvdimm_bus *nvdimm_bus = dev->platform_data; - pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO); - if (!pdev) - return; + if (nvdimm_bus) + nvdimm_bus_unregister(nvdimm_bus); +} - error = platform_device_add_resources(pdev, res, 1); - if (error) - goto out_put_pdev; +static struct platform_device e820_pmem = { + .name = "e820_pmem", + .id = -1, + .dev = { + .release = e820_pmem_release, + }, +}; - error = platform_device_add(pdev); - if (error) - goto out_put_pdev; - return; +static const struct attribute_group *e820_pmem_attribute_groups[] = { + &nvdimm_bus_attribute_group, + NULL, +}; -out_put_pdev: - dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n"); - platform_device_put(pdev); -} +static const struct attribute_group *e820_pmem_region_attribute_groups[] = { + &nd_region_attribute_group, + &nd_device_attribute_group, + NULL, +}; -static __init int register_pmem_devices(void) +static __init int register_e820_pmem(void) { - int i; + static struct nvdimm_bus_descriptor nd_desc; + struct device *dev = &e820_pmem.dev; + struct nvdimm_bus *nvdimm_bus; + int rc, i; + + rc = platform_device_register(&e820_pmem); + if (rc) + return rc; + + nd_desc.attr_groups = e820_pmem_attribute_groups; + nd_desc.provider_name = "e820"; + nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); + if (!nvdimm_bus) + goto err; + dev->platform_data = nvdimm_bus; for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; + struct resource res = { + .flags = IORESOURCE_MEM, + .start = ei->addr, + .end = ei->addr + ei->size - 1, + }; + struct nd_region_desc ndr_desc; + + if (ei->type != E820_PRAM) + continue; - if (ei->type == E820_PRAM) { - struct resource res = { - .flags = IORESOURCE_MEM, - .start = ei->addr, - .end = ei->addr + ei->size - 1, - }; - register_pmem_device(&res); - } + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.res = &res; + ndr_desc.attr_groups = e820_pmem_region_attribute_groups; + if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + goto err; } return 0; + + err: + dev_err(dev, "failed to register legacy persistent memory ranges\n"); + platform_device_unregister(&e820_pmem); + return -ENXIO; } -device_initcall(register_pmem_devices); +device_initcall(register_e820_pmem); -- cgit v1.2.1 From da028d5e5463dabb6ede2f5e3f6cced1283988cc Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 25 Jun 2015 22:44:11 +0200 Subject: um: Don't pollute kernel namespace with uapi Don't include ptrace uapi stuff in arch headers, it will pollute the kernel namespace and conflict with existing stuff. In this case it fixes clashes with common names like R8. Signed-off-by: Richard Weinberger --- arch/x86/um/ptrace_32.c | 1 + arch/x86/um/ptrace_64.c | 1 + arch/x86/um/tls_32.c | 1 + arch/x86/um/tls_64.c | 1 + 4 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index ce3dd4f36f3f..a29756f2d940 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -6,6 +6,7 @@ #include #include #include +#include #include extern int arch_switch_tls(struct task_struct *to); diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 3b52bf0b418a..a629694ee750 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -11,6 +11,7 @@ #define __FRAME_OFFSETS #include #include +#include /* * determines which flags the user has access to. diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index 80ffa5b9982d..48e38584d5c1 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/um/tls_64.c b/arch/x86/um/tls_64.c index d22363cb854e..3ad714373d7f 100644 --- a/arch/x86/um/tls_64.c +++ b/arch/x86/um/tls_64.c @@ -1,4 +1,5 @@ #include +#include void clear_flushed_tls(struct task_struct *task) { -- cgit v1.2.1 From 08bd4fc15683b9a26b9be7048d151f4ddadad96f Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Thu, 25 Jun 2015 14:59:44 -0700 Subject: mm/hugetlb: remove arch_prepare/release_hugepage from arch headers Nobody used these hooks so they were removed from common code, and can now be removed from the architectures. Signed-off-by: Dominik Dingel Acked-by: Martin Schwidefsky Acked-by: Ralf Baechle Cc: Heiko Carstens Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/hugetlb.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index dab7a3a750bf..f8a29d2c97b0 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -80,15 +80,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return *ptep; } -static inline int arch_prepare_hugepage(struct page *page) -{ - return 0; -} - -static inline void arch_release_hugepage(struct page *page) -{ -} - static inline void arch_clear_hugepage_flags(struct page *page) { } -- cgit v1.2.1 From 41d7a6d637e1440f5410cb43c25a3c41255540c5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 12:18:33 -0600 Subject: libnvdimm: Set numa_node to NVDIMM devices ACPI NFIT table has System Physical Address Range Structure entries that describe a proximity ID of each range when ACPI_NFIT_PROXIMITY_VALID is set in the flags. Change acpi_nfit_register_region() to map a proximity ID to its node ID, and set it to a new numa_node field of nd_region_desc, which is then conveyed to the nd_region device. The device core arranges for btt and namespace devices to inherit their node from their parent region. Signed-off-by: Toshi Kani [djbw: move set_dev_node() from region.c to bus.c] Signed-off-by: Dan Williams --- arch/x86/kernel/pmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 0f4ef472ab9e..64f90f53bb85 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -67,6 +67,7 @@ static __init int register_e820_pmem(void) memset(&ndr_desc, 0, sizeof(ndr_desc)); ndr_desc.res = &res; ndr_desc.attr_groups = e820_pmem_region_attribute_groups; + ndr_desc.numa_node = NUMA_NO_NODE; if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) goto err; } -- cgit v1.2.1 From 61031952f4c89dba1065f7a5b9419badb112554c Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 25 Jun 2015 03:08:39 -0400 Subject: arch, x86: pmem api for ensuring durability of persistent memory updates Based on an original patch by Ross Zwisler [1]. Writes to persistent memory have the potential to be posted to cpu cache, cpu write buffers, and platform write buffers (memory controller) before being committed to persistent media. Provide apis, memcpy_to_pmem(), wmb_pmem(), and memremap_pmem(), to write data to pmem and assert that it is durable in PMEM (a persistent linear address range). A '__pmem' attribute is added so sparse can track proper usage of pointers to pmem. This continues the status quo of pmem being x86 only for 4.2, but reworks to ioremap, and wider implementation of memremap() will enable other archs in 4.3. [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Ross Zwisler [djbw: various reworks] Signed-off-by: Dan Williams --- arch/x86/Kconfig | 1 + arch/x86/include/asm/cacheflush.h | 72 +++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/io.h | 6 ++++ 3 files changed, 79 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1a2cbf641667..62564ddf7f78 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,6 +27,7 @@ config X86 select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_PMEM_API select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_AOUT if X86_32 diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 47c8e32f621a..ec23bb753a3e 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -4,6 +4,7 @@ /* Caches aren't brain-dead on the intel. */ #include #include +#include /* * The set_memory_* API can be used to change various attributes of a virtual @@ -104,4 +105,75 @@ static inline int rodata_test(void) } #endif +#ifdef ARCH_HAS_NOCACHE_UACCESS + +/** + * arch_memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Copy data to persistent memory media via non-temporal stores so that + * a subsequent arch_wmb_pmem() can flush cpu and memory controller + * write buffers to guarantee durability. + */ +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + int unwritten; + + /* + * We are copying between two kernel buffers, if + * __copy_from_user_inatomic_nocache() returns an error (page + * fault) we would have already reported a general protection fault + * before the WARN+BUG. + */ + unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, + (void __user *) src, n); + if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", + __func__, dst, src, unwritten)) + BUG(); +} + +/** + * arch_wmb_pmem - synchronize writes to persistent memory + * + * After a series of arch_memcpy_to_pmem() operations this drains data + * from cpu write buffers and any platform (memory controller) buffers + * to ensure that written data is durable on persistent memory media. + */ +static inline void arch_wmb_pmem(void) +{ + /* + * wmb() to 'sfence' all previous writes such that they are + * architecturally visible to 'pcommit'. Note, that we've + * already arranged for pmem writes to avoid the cache via + * arch_memcpy_to_pmem(). + */ + wmb(); + pcommit_sfence(); +} + +static inline bool __arch_has_wmb_pmem(void) +{ +#ifdef CONFIG_X86_64 + /* + * We require that wmb() be an 'sfence', that is only guaranteed on + * 64-bit builds + */ + return static_cpu_has(X86_FEATURE_PCOMMIT); +#else + return false; +#endif +} +#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */ +extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n); +extern void arch_wmb_pmem(void); + +static inline bool __arch_has_wmb_pmem(void) +{ + return false; +} +#endif + #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 34a5b93704d3..c60c3f3b0183 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -247,6 +247,12 @@ static inline void flush_write_buffers(void) #endif } +static inline void __pmem *arch_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + return (void __force __pmem *) ioremap_cache(offset, size); +} + #endif /* __KERNEL__ */ extern void native_io_delay(void); -- cgit v1.2.1 From 0fbafd06bdde938884f7326548d3df812b267c3c Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Sat, 27 Jun 2015 15:56:38 +0900 Subject: crypto: aesni - fix failing setkey for rfc4106-gcm-aesni rfc4106(gcm(aes)) uses ctr(aes) to generate hash key. ctr(aes) needs chainiv, but the chainiv gets initialized after aesni_intel when both are statically linked so the setkey fails. This patch forces aesni_intel to be initialized after chainiv. Signed-off-by: Tadeusz Struk Tested-by: Linus Torvalds Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 2bfc8a7c88c1..dccad38b59a8 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1537,7 +1537,7 @@ static void __exit aesni_exit(void) crypto_fpu_exit(); } -module_init(aesni_init); +late_initcall(aesni_init); module_exit(aesni_exit); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized"); -- cgit v1.2.1 From 0a8b83530b6f67b9a50bd7937d57a5deea187b5b Mon Sep 17 00:00:00 2001 From: "qipeng.zha" Date: Sat, 27 Jun 2015 00:32:15 +0800 Subject: intel_pmc_ipc: Add Intel Apollo Lake PMC IPC driver This driver provides support for PMC control on Apollo Lake platforms. The PMC is an ARC processor which defines some IPC commands for communication with other entities in the CPU. Signed-off-by: qipeng.zha [fengguang.wu@intel.com: Fix Sparse and Cocinelle warnings] Signed-off-by: Fengguang Wu Signed-off-by: Darren Hart --- arch/x86/include/asm/intel_pmc_ipc.h | 82 ++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 arch/x86/include/asm/intel_pmc_ipc.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h new file mode 100644 index 000000000000..200ec2e7821d --- /dev/null +++ b/arch/x86/include/asm/intel_pmc_ipc.h @@ -0,0 +1,82 @@ +#ifndef _ASM_X86_INTEL_PMC_IPC_H_ +#define _ASM_X86_INTEL_PMC_IPC_H_ + +/* Commands */ +#define PMC_IPC_PMIC_ACCESS 0xFF +#define PMC_IPC_PMIC_ACCESS_READ 0x0 +#define PMC_IPC_PMIC_ACCESS_WRITE 0x1 +#define PMC_IPC_USB_PWR_CTRL 0xF0 +#define PMC_IPC_PMIC_BLACKLIST_SEL 0xEF +#define PMC_IPC_PHY_CONFIG 0xEE +#define PMC_IPC_NORTHPEAK_CTRL 0xED +#define PMC_IPC_PM_DEBUG 0xEC +#define PMC_IPC_PMC_TELEMTRY 0xEB +#define PMC_IPC_PMC_FW_MSG_CTRL 0xEA + +/* IPC return code */ +#define IPC_ERR_NONE 0 +#define IPC_ERR_CMD_NOT_SUPPORTED 1 +#define IPC_ERR_CMD_NOT_SERVICED 2 +#define IPC_ERR_UNABLE_TO_SERVICE 3 +#define IPC_ERR_CMD_INVALID 4 +#define IPC_ERR_CMD_FAILED 5 +#define IPC_ERR_EMSECURITY 6 +#define IPC_ERR_UNSIGNEDKERNEL 7 + +#if IS_ENABLED(CONFIG_INTEL_PMC_IPC) + +/* + * intel_pmc_ipc_simple_command + * @cmd: command + * @sub: sub type + */ +int intel_pmc_ipc_simple_command(int cmd, int sub); + +/* + * intel_pmc_ipc_raw_cmd + * @cmd: command + * @sub: sub type + * @in: input data + * @inlen: input length in bytes + * @out: output data + * @outlen: output length in dwords + * @sptr: data writing to SPTR register + * @dptr: data writing to DPTR register + */ +int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen, + u32 *out, u32 outlen, u32 dptr, u32 sptr); + +/* + * intel_pmc_ipc_command + * @cmd: command + * @sub: sub type + * @in: input data + * @inlen: input length in bytes + * @out: output data + * @outlen: output length in dwords + */ +int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, + u32 *out, u32 outlen); + +#else + +static inline int intel_pmc_ipc_simple_command(int cmd, int sub) +{ + return -EINVAL; +} + +static inline int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen, + u32 *out, u32 outlen, u32 dptr, u32 sptr) +{ + return -EINVAL; +} + +static inline int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, + u32 *out, u32 outlen) +{ + return -EINVAL; +} + +#endif /*CONFIG_INTEL_PMC_IPC*/ + +#endif -- cgit v1.2.1 From db52ef74b35dcb91fd154fa52c618bdd1b90e28e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 27 Jun 2015 10:25:14 +0200 Subject: x86/fpu: Fix FPU related boot regression when CPUID masking BIOS feature is enabled Mike Galbraith reported: " My i7-4790 box is having one hell of a time with this merge window, dead in the water. BIOS setting "Limit CPUID Maximum" upsets new fpu code mightily. " It turns out that Linux does a double workaround here, as per: 066941bd4eeb ("x86: unmask CPUID levels on Intel CPUs") it undoes the BIOS workaround - but as a side effect the CPUID state is not completely constant during early init anymore, and the new FPU init code did not take this into account. So what happened is that the xstate init code did not have full CPUID available, which broke subsequent attempts to use xstate features. Fix this by ordering the early FPU init code to after we've stabilized the CPUID state. Reported-bisected-and-tested-by: Mike Galbraith Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150627082514.GA10894@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9fc5e3d9d9c8..922c5e0cea4c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -742,7 +742,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); - fpu__init_system(c); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -754,6 +753,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_bsp_init(c); setup_force_cpu_cap(X86_FEATURE_ALWAYS); + fpu__init_system(c); } void __init early_cpu_init(void) -- cgit v1.2.1 From 93472aff802fd7b61f2209335207e9bd793012f7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Jun 2015 16:47:50 +0200 Subject: perf/x86: Fix 'active_events' imbalance Commit 1b7b938f1817 ("perf/x86/intel: Fix PMI handling for Intel PT") conditionally increments active_events in x86_add_exclusive() but unconditionally decrements in x86_del_exclusive(). These extra decrements can lead to the situation where active_events is zero and thus the PMI handler is 'disabled' while we have active events on the PMU generating PMIs. This leads to a truckload of: Uhhuh. NMI received for unknown reason 21 on CPU 28. Do you have a strange power saving mode enabled? Dazed and confused, but trying to continue messages and generally messes up perf. Remove the condition on the increment, double increment balanced by a double decrement is perfectly fine. Restructure the code a little bit to make the unconditional inc a bit more natural. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: alexander.shishkin@linux.intel.com Cc: brgerst@gmail.com Cc: dvlasenk@redhat.com Cc: luto@amacapital.net Cc: oleg@redhat.com Fixes: 1b7b938f1817 ("perf/x86/intel: Fix PMI handling for Intel PT") Link: http://lkml.kernel.org/r/20150624144750.GJ18673@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5801a14f7524..3658de47900f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -357,34 +357,24 @@ void x86_release_hardware(void) */ int x86_add_exclusive(unsigned int what) { - int ret = -EBUSY, i; - - if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) - return 0; + int i; - mutex_lock(&pmc_reserve_mutex); - for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { - if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) - goto out; + if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { + mutex_lock(&pmc_reserve_mutex); + for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { + if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) + goto fail_unlock; + } + atomic_inc(&x86_pmu.lbr_exclusive[what]); + mutex_unlock(&pmc_reserve_mutex); } - atomic_inc(&x86_pmu.lbr_exclusive[what]); - ret = 0; + atomic_inc(&active_events); + return 0; -out: +fail_unlock: mutex_unlock(&pmc_reserve_mutex); - - /* - * Assuming that all exclusive events will share the PMI handler - * (which checks active_events for whether there is work to do), - * we can bump active_events counter right here, except for - * x86_lbr_exclusive_lbr events that go through x86_pmu_event_init() - * path, which already bumps active_events for them. - */ - if (!ret && what != x86_lbr_exclusive_lbr) - atomic_inc(&active_events); - - return ret; + return -EBUSY; } void x86_del_exclusive(unsigned int what) -- cgit v1.2.1 From 3b242c66ccbd60cf47ab0e8992119d9617548c23 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:13 -0700 Subject: x86: mm: enable deferred struct page initialisation on x86-64 Subject says it all. Other architectures may enable on a case-by-case basis after auditing early_pfn_to_nid and testing. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d05a42357ef0..47365c7b9f47 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86 select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_SUPPORTS_INT128 if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_USE_BUILTIN_BSWAP -- cgit v1.2.1 From a846f47962e4643b33847c9a0b2b6387f02a93be Mon Sep 17 00:00:00 2001 From: KarimAllah Ahmed Date: Tue, 30 Jun 2015 14:57:39 -0700 Subject: x86/kexec: prepend elfcorehdr instead of appending it to the crash-kernel command-line. Any parameter passed after '--' in the kernel command-line will not be parsed by the kernel at all, instead it will be passed directly to init process. Currently the kernel appends elfcorehdr= to the cmdline passed from kexec load, and if this command-line is used to pass parameters to init process this means that 'elfcorehdr' will not be parsed as a kernel parameter at all which will be a problem for vmcore subsystem since it will know nothing about the location of the ELF structure! Prepending 'elfcorehdr' instead of appending it fixes this problem since it ensures that it always comes before '--' and so it's always parsed as a kernel command-line parameter. Even with this patch things can still go wrong if 'CONFIG_CMDLINE' was also used to embedd a command-line to the crash dump kernel and this command-line contains '--' since the current behavior of the kernel is to actually append the boot loader command-line to the embedded command-line. Signed-off-by: KarimAllah Ahmed Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Acked-by: Vivek Goyal Cc: Haren Myneni Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/kexec-bzimage64.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index ca05f86481aa..ca83f7ac388b 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -72,15 +72,16 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params, unsigned long cmdline_len) { char *cmdline_ptr = ((char *)params) + cmdline_offset; - unsigned long cmdline_ptr_phys, len; + unsigned long cmdline_ptr_phys, len = 0; uint32_t cmdline_low_32, cmdline_ext_32; - memcpy(cmdline_ptr, cmdline, cmdline_len); if (image->type == KEXEC_TYPE_CRASH) { - len = sprintf(cmdline_ptr + cmdline_len - 1, - " elfcorehdr=0x%lx", image->arch.elf_load_addr); - cmdline_len += len; + len = sprintf(cmdline_ptr, + "elfcorehdr=0x%lx ", image->arch.elf_load_addr); } + memcpy(cmdline_ptr + len, cmdline, cmdline_len); + cmdline_len += len; + cmdline_ptr[cmdline_len - 1] = '\0'; pr_debug("Final command line is: %s\n", cmdline_ptr); -- cgit v1.2.1 From c1bd55f922a2df6038060ed48a82d146d301ca12 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 30 Jun 2015 15:00:00 -0700 Subject: x86: opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit For 32-bit userspace on a 64-bit kernel, this requires modifying stub32_clone to actually swap the appropriate arguments to match CONFIG_CLONE_BACKWARDS, rather than just leaving the C argument for tls broken. Patch co-authored by Josh Triplett and Thiago Macieira. Signed-off-by: Josh Triplett Acked-by: Andy Lutomirski Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thiago Macieira Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/kernel/process_32.c | 6 +++--- arch/x86/kernel/process_64.c | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 47365c7b9f47..55bced17dc95 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING if X86_64 + select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c09c99ccf3e3..f73c962fe636 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -128,8 +128,8 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); struct task_struct *tsk; @@ -184,7 +184,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->si, 0); + (struct user_desc __user *)tls, 0); if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 843f92e4c711..71d7849a07f7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -150,8 +150,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) return get_desc_base(&t->thread.tls_array[tls]); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { int err; struct pt_regs *childregs; @@ -207,10 +207,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, #ifdef CONFIG_IA32_EMULATION if (is_ia32_task()) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->si, 0); + (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + err = do_arch_prctl(p, ARCH_SET_FS, tls); if (err) goto out; } -- cgit v1.2.1 From 42720138b06301cc8a7ee8a495a6d021c4b6a9bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 1 Jul 2015 15:31:49 +0200 Subject: KVM: x86: make vapics_in_nmi_mode atomic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Writes were a bit racy, but hard to turn into a bug at the same time. (Particularly because modern Linux doesn't use this feature anymore.) Signed-off-by: Radim Krčmář [Actually the next patch makes it much, much easier to trigger the race so I'm including this one for stable@ as well. - Paolo] Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/lapic.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c7fa57b529d2..2a7f5d782c33 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -607,7 +607,7 @@ struct kvm_arch { struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; - int vapics_in_nmi_mode; + atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; struct kvm_apic_map *apic_map; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4dce6f8b6129..f90952f64e79 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -305,7 +305,7 @@ static void pit_do_work(struct kthread_work *work) * LVT0 to NMI delivery. Other PIC interrupts are just sent to * VCPU0, and only if its LVT0 is in EXTINT mode. */ - if (kvm->arch.vapics_in_nmi_mode > 0) + if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0) kvm_for_each_vcpu(i, vcpu, kvm) kvm_apic_nmi_wd_deliver(vcpu); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 36e9de1b4127..607a56b35327 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1263,10 +1263,10 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) if (!nmi_wd_enabled) { apic_debug("Receive NMI setting on APIC_LVT0 " "for cpu %d\n", apic->vcpu->vcpu_id); - apic->vcpu->kvm->arch.vapics_in_nmi_mode++; + atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); } } else if (nmi_wd_enabled) - apic->vcpu->kvm->arch.vapics_in_nmi_mode--; + atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); } static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) -- cgit v1.2.1 From db1385624c686fe99fe2d1b61a36e1537b915d08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 30 Jun 2015 22:19:17 +0200 Subject: KVM: x86: properly restore LVT0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Legacy NMI watchdog didn't work after migration/resume, because vapics_in_nmi_mode was left at 0. Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 607a56b35327..e0f721bfabcb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1822,6 +1822,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); apic_update_lvtt(apic); + apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; -- cgit v1.2.1 From 59fd132340b3e37b83179d2fcb673980035edf62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 30 Jun 2015 22:19:16 +0200 Subject: KVM: x86: keep track of LVT0 changes under APICv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memory-mapped LVT0 register already contains the new value when APICv traps so we can't directly detect a change. Memorize a bit we are interested in to enable legacy NMI watchdog. Suggested-by: Yoshida Nobuo Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 14 ++++++++------ arch/x86/kvm/lapic.h | 1 + 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e0f721bfabcb..954e98a8c2e3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1257,16 +1257,17 @@ static void start_apic_timer(struct kvm_lapic *apic) static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) { - int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0)); + bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val); - if (apic_lvt_nmi_mode(lvt0_val)) { - if (!nmi_wd_enabled) { + if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) { + apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode; + if (lvt0_in_nmi_mode) { apic_debug("Receive NMI setting on APIC_LVT0 " "for cpu %d\n", apic->vcpu->vcpu_id); atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); - } - } else if (nmi_wd_enabled) - atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); + } else + atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); + } } static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) @@ -1597,6 +1598,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED)) apic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); + apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0)); apic_set_reg(apic, APIC_DFR, 0xffffffffU); apic_set_spiv(apic, 0xff); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f2f4e10ab772..71952748222a 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -26,6 +26,7 @@ struct kvm_lapic { struct kvm_vcpu *vcpu; bool sw_enabled; bool irr_pending; + bool lvt0_in_nmi_mode; /* Number of bits set in ISR. */ s16 isr_count; /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */ -- cgit v1.2.1 From b0996ae48285364710bce812e70ce67771ea6ef7 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Mon, 29 Jun 2015 18:39:23 +0800 Subject: KVM: x86: remove data variable from kvm_get_msr_common Commit 609e36d372ad ("KVM: x86: pass host_initiated to functions that read MSRs") modified kvm_get_msr_common function to use msr_info->data instead of data but missed one occurrence. Replace it and remove the unused local variable. Fixes: 609e36d372ad ("KVM: x86: pass host_initiated to functions that read MSRs") Signed-off-by: Nicolas Iooss Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ac165c2fb8e5..bbaf44e8f0d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2379,8 +2379,6 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - u64 data; - switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: @@ -2453,7 +2451,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* TSC increment by tick */ msr_info->data = 1000ULL; /* CPU multiplier */ - data |= (((uint64_t)4ULL) << 40); + msr_info->data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: msr_info->data = vcpu->arch.efer; -- cgit v1.2.1 From a88464a8b0ffb2f8dfb69d3ab982169578b50f22 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Thu, 2 Jul 2015 19:07:46 +0300 Subject: kvm: add hyper-v crash msrs values Added Hyper-V crash msrs values - HV_X64_MSR_CRASH*. Signed-off-by: Andrey Smetanin Signed-off-by: Denis V. Lunev Reviewed-by: Peter Hornyack CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index ce6068dbcfbc..8fba544e9cc4 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -199,6 +199,17 @@ #define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 #define HV_X64_MSR_STIMER3_COUNT 0x400000B7 +/* Hyper-V guest crash notification MSR's */ +#define HV_X64_MSR_CRASH_P0 0x40000100 +#define HV_X64_MSR_CRASH_P1 0x40000101 +#define HV_X64_MSR_CRASH_P2 0x40000102 +#define HV_X64_MSR_CRASH_P3 0x40000103 +#define HV_X64_MSR_CRASH_P4 0x40000104 +#define HV_X64_MSR_CRASH_CTL 0x40000105 +#define HV_X64_MSR_CRASH_CTL_NOTIFY (1ULL << 63) +#define HV_X64_MSR_CRASH_PARAMS \ + (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) + #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ -- cgit v1.2.1 From b96fecbfa8c88b057e2bbf10021521c232bb3650 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 4 Jul 2015 09:58:19 +0200 Subject: x86/fpu: Fix boot crash in the early FPU code Jan Kara and Thomas Gleixner reported boot crashes in the FPU code: general protection fault: 0000 [#1] SMP RIP: 0010:[] [] mxcsr_feature_mask_init+0x1c/0x40 2b:* 0f ae 85 00 fe ff ff fxsave -0x200(%rbp) and bisected it down to the following FPU commit: 91a8c2a5b43f ("x86/fpu: Clean up and fix MXCSR handling") The reason is that the on-stack FPU registers state variable, used by the FXSAVE instruction, did not have the required minimum alignment of 16 bytes, causing the general protection fault. This is most likely a GCC bug in older GCC versions, but the offending commit also added a bogus extra 32-byte alignment (which GCC ignored too). So fix this bug by making the variable static again, but also mark it __initdata this time, because fpu__init_system_mxcsr() is now an __init function. Reported-and-bisected-by: Jan Kara Reported-bisected-and-tested-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Jan Kara Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150704075819.GA9201@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index fc878fee6a51..32826791e675 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -95,11 +95,12 @@ static void __init fpu__init_system_mxcsr(void) unsigned int mask = 0; if (cpu_has_fxsr) { - struct fxregs_state fx_tmp __aligned(32) = { }; + /* Static because GCC does not get 16-byte stack alignment right: */ + static struct fxregs_state fxregs __initdata; - asm volatile("fxsave %0" : "+m" (fx_tmp)); + asm volatile("fxsave %0" : "+m" (fxregs)); - mask = fx_tmp.mxcsr_mask; + mask = fxregs.mxcsr_mask; /* * If zero then use the default features mask, -- cgit v1.2.1 From 5aac644a9944bea93b4f05ced1883a902a2535f6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 3 Jun 2015 10:39:46 +0300 Subject: x86/tsc: Let high latency PIT fail fast in quick_pit_calibrate() If it takes longer than 12us to read the PIT counter lsb/msb, then the error margin will never fall below 500ppm within 50ms, and Fast TSC calibration will always fail. This patch detects when that will happen and fails fast. Note the failure message is not printed in that case because: 1. it will always happen on that class of hardware 2. the absence of the message is more informative than its presence Signed-off-by: Adrian Hunter Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Len Brown Cc: Andi Kleen Link: http://lkml.kernel.org/r/556EB717.9070607@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 505449700e0c..7437b41f6a47 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -598,10 +598,19 @@ static unsigned long quick_pit_calibrate(void) if (!pit_expect_msb(0xff-i, &delta, &d2)) break; + delta -= tsc; + + /* + * Extrapolate the error and fail fast if the error will + * never be below 500 ppm. + */ + if (i == 1 && + d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11) + return 0; + /* * Iterate until the error is less than 500 ppm */ - delta -= tsc; if (d1+d2 >= delta >> 11) continue; -- cgit v1.2.1 From ebf2d2689de551d90965090bb991fc640a0c0d41 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Mon, 22 Jun 2015 21:38:43 +0200 Subject: perf/x86: Fix copy_from_user_nmi() return if range is not ok Commit 0a196848ca36 ("perf: Fix arch_perf_out_copy_user default"), changes copy_from_user_nmi() to return the number of remaining bytes so that it behave like copy_from_user(). Unfortunately, when the range is outside of the process memory, the return value is still the number of byte copied, eg. 0, instead of the remaining bytes. As all users of copy_from_user_nmi() were modified as part of commit 0a196848ca36, the function should be fixed to return the total number of bytes if range is not correct. Signed-off-by: Yann Droneaud Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435001923-30986-1-git-send-email-ydroneaud@opteya.com Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index ddf9ecb53cc3..e342586db6e4 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -20,7 +20,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) unsigned long ret; if (__range_not_ok(from, n, TASK_SIZE)) - return 0; + return n; /* * Even though this function is typically called from NMI/IRQ context -- cgit v1.2.1 From d0f77d4d04b222a817925d33ba3589b190bfa863 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:33 +0300 Subject: x86/init: Clear 'init_level4_pgt' earlier Currently x86_64_start_kernel() has two KASAN related function calls. The first call maps shadow to early_level4_pgt, the second maps shadow to init_level4_pgt. If we move clear_page(init_level4_pgt) earlier, we could hide KASAN low level detail from generic x86_64 initialization code. The next patch will do it. Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-2-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5a4668136e98..65c8985e3eb2 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -166,6 +166,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); + clear_page(init_level4_pgt); + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); @@ -177,7 +179,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - clear_page(init_level4_pgt); /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; -- cgit v1.2.1 From 5d5aa3cfca5cf74cd928daf3674642e6004328d1 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Thu, 2 Jul 2015 12:09:34 +0300 Subject: x86/kasan: Fix KASAN shadow region page tables Currently KASAN shadow region page tables created without respect of physical offset (phys_base). This causes kernel halt when phys_base is not zero. So let's initialize KASAN shadow region page tables in kasan_early_init() using __pa_nodebug() which considers phys_base. This patch also separates x86_64_start_kernel() from KASAN low level details by moving kasan_map_early_shadow(init_level4_pgt) into kasan_early_init(). Remove the comment before clear_bss() which stopped bringing much profit to the code readability. Otherwise describing all the new order dependencies would be too verbose. Signed-off-by: Alexander Popov Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-3-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kasan.h | 8 ++------ arch/x86/kernel/head64.c | 7 ++----- arch/x86/kernel/head_64.S | 29 ----------------------------- arch/x86/mm/kasan_init_64.c | 36 ++++++++++++++++++++++++++++++++++-- 4 files changed, 38 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index 8b22422fbad8..74a2a8dc9908 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -14,15 +14,11 @@ #ifndef __ASSEMBLY__ -extern pte_t kasan_zero_pte[]; -extern pte_t kasan_zero_pmd[]; -extern pte_t kasan_zero_pud[]; - #ifdef CONFIG_KASAN -void __init kasan_map_early_shadow(pgd_t *pgd); +void __init kasan_early_init(void); void __init kasan_init(void); #else -static inline void kasan_map_early_shadow(pgd_t *pgd) { } +static inline void kasan_early_init(void) { } static inline void kasan_init(void) { } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 65c8985e3eb2..f129a9af6357 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -161,13 +161,12 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* Kill off the identity-map trampoline */ reset_early_page_tables(); - kasan_map_early_shadow(early_level4_pgt); - - /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); clear_page(init_level4_pgt); + kasan_early_init(); + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); @@ -182,8 +181,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; - kasan_map_early_shadow(init_level4_pgt); - x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e5c27f729a38..1d40ca8a73f2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -516,38 +516,9 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 -#ifdef CONFIG_KASAN -#define FILL(VAL, COUNT) \ - .rept (COUNT) ; \ - .quad (VAL) ; \ - .endr - -NEXT_PAGE(kasan_zero_pte) - FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512) -NEXT_PAGE(kasan_zero_pmd) - FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512) -NEXT_PAGE(kasan_zero_pud) - FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512) - -#undef FILL -#endif - - #include "../../x86/xen/xen-head.S" __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE -#ifdef CONFIG_KASAN -/* - * This page used as early shadow. We don't use empty_zero_page - * at early stages, stack instrumentation could write some garbage - * to this page. - * Latter we reuse it as zero shadow for large ranges of memory - * that allowed to access, but not instrumented by kasan - * (vmalloc/vmemmap ...). - */ -NEXT_PAGE(kasan_zero_page) - .skip PAGE_SIZE -#endif diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 4860906c6b9f..0e4a05fa34d7 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -11,7 +11,19 @@ extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_X_MAX]; -extern unsigned char kasan_zero_page[PAGE_SIZE]; +static pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +static pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +static pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; + +/* + * This page used as early shadow. We don't use empty_zero_page + * at early stages, stack instrumentation could write some garbage + * to this page. + * Latter we reuse it as zero shadow for large ranges of memory + * that allowed to access, but not instrumented by kasan + * (vmalloc/vmemmap ...). + */ +static unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; static int __init map_range(struct range *range) { @@ -36,7 +48,7 @@ static void __init clear_pgds(unsigned long start, pgd_clear(pgd_offset_k(start)); } -void __init kasan_map_early_shadow(pgd_t *pgd) +static void __init kasan_map_early_shadow(pgd_t *pgd) { int i; unsigned long start = KASAN_SHADOW_START; @@ -166,6 +178,26 @@ static struct notifier_block kasan_die_notifier = { }; #endif +void __init kasan_early_init(void) +{ + int i; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; + pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; + + for (i = 0; i < PTRS_PER_PTE; i++) + kasan_zero_pte[i] = __pte(pte_val); + + for (i = 0; i < PTRS_PER_PMD; i++) + kasan_zero_pmd[i] = __pmd(pmd_val); + + for (i = 0; i < PTRS_PER_PUD; i++) + kasan_zero_pud[i] = __pud(pud_val); + + kasan_map_early_shadow(early_level4_pgt); + kasan_map_early_shadow(init_level4_pgt); +} + void __init kasan_init(void) { int i; -- cgit v1.2.1 From 241d2c54c62fa0939fc9a9512b48ac3434e90a89 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:35 +0300 Subject: x86/kasan: Flush TLBs after switching CR3 load_cr3() doesn't cause tlb_flush if PGE enabled. This may cause tons of false positive reports spamming the kernel to death. To fix this __flush_tlb_all() should be called explicitly after CR3 changed. Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-4-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0e4a05fa34d7..5d26642e4e8a 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -208,6 +208,7 @@ void __init kasan_init(void) memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); load_cr3(early_level4_pgt); + __flush_tlb_all(); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); @@ -234,5 +235,6 @@ void __init kasan_init(void) memset(kasan_zero_page, 0, PAGE_SIZE); load_cr3(init_level4_pgt); + __flush_tlb_all(); init_task.kasan_depth = 0; } -- cgit v1.2.1 From d4f86beacc21d538dc41e1fc75a22e084f547edf Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:36 +0300 Subject: x86/kasan: Fix boot crash on AMD processors While populating zero shadow wrong bits in upper level page tables used. __PAGE_KERNEL_RO that was used for pgd/pud/pmd has _PAGE_BIT_GLOBAL set. Global bit is present only in the lowest level of the page translation hierarchy (ptes), and it should be zero in upper levels. This bug seems doesn't cause any troubles on Intel cpus, while on AMDs it cause kernel crash on boot. Use _KERNPG_TABLE bits for pgds/puds/pmds to fix this. Reported-by: Borislav Petkov Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-5-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 5d26642e4e8a..9a54dbe98064 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -85,7 +85,7 @@ static int __init zero_pmd_populate(pud_t *pud, unsigned long addr, while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) { WARN_ON(!pmd_none(*pmd)); set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte) - | __PAGE_KERNEL_RO)); + | _KERNPG_TABLE)); addr += PMD_SIZE; pmd = pmd_offset(pud, addr); } @@ -111,7 +111,7 @@ static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr, while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) { WARN_ON(!pud_none(*pud)); set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd) - | __PAGE_KERNEL_RO)); + | _KERNPG_TABLE)); addr += PUD_SIZE; pud = pud_offset(pgd, addr); } @@ -136,7 +136,7 @@ static int __init zero_pgd_populate(unsigned long addr, unsigned long end) while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) { WARN_ON(!pgd_none(*pgd)); set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud) - | __PAGE_KERNEL_RO)); + | _KERNPG_TABLE)); addr += PGDIR_SIZE; pgd = pgd_offset_k(addr); } -- cgit v1.2.1 From 8515522949951d81fe2d06c0a3292f171f2b8ec4 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:37 +0300 Subject: x86/kasan: Add message about KASAN being initialized Print informational message to tell user that kernel runs with KASAN enabled. Add a "kasan: " prefix to all messages in kasan_init_64.c. Signed-off-by: Andrey Ryabinin Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-6-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 9a54dbe98064..e1840f3db5b5 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,3 +1,4 @@ +#define pr_fmt(fmt) "kasan: " fmt #include #include #include @@ -237,4 +238,6 @@ void __init kasan_init(void) load_cr3(init_level4_pgt); __flush_tlb_all(); init_task.kasan_depth = 0; + + pr_info("Kernel address sanitizer initialized\n"); } -- cgit v1.2.1 From d6f2d75a7ae06ffd793bb504c4f0d1665548cffc Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:38 +0300 Subject: x86/kasan: Move KASAN_SHADOW_OFFSET to the arch Kconfig KASAN_SHADOW_OFFSET is purely arch specific setting, so it should be in arch's Kconfig file. Signed-off-by: Andrey Ryabinin Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Paul Bolle Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-7-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 55bced17dc95..0b2929f49531 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -254,6 +254,11 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config KASAN_SHADOW_OFFSET + hex + depends on KASAN + default 0xdffffc0000000000 + config HAVE_INTEL_TXT def_bool y depends on INTEL_IOMMU && ACPI -- cgit v1.2.1 From 1db875631f8d5bbf497f67e47f254eece888d51d Mon Sep 17 00:00:00 2001 From: Zhu Guihua Date: Fri, 3 Jul 2015 17:37:18 +0800 Subject: x86/espfix: Add 'cpu' parameter to init_espfix_ap() Add a CPU index parameter to init_espfix_ap(), so that the parameter could be propagated to the function for espfix page allocation. Signed-off-by: Zhu Guihua Cc: Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cde3fcf1b3211f3f03feb1a995bce3fee850f0fc.1435824469.git.zhugh.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/espfix.h | 2 +- arch/x86/kernel/espfix_64.c | 7 +++---- arch/x86/kernel/smpboot.c | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h index 99efebb2f69d..ca3ce9ab9385 100644 --- a/arch/x86/include/asm/espfix.h +++ b/arch/x86/include/asm/espfix.h @@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); extern void init_espfix_bsp(void); -extern void init_espfix_ap(void); +extern void init_espfix_ap(int cpu); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index f5d0730e7b08..1fb0e0003c94 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -131,12 +131,12 @@ void __init init_espfix_bsp(void) init_espfix_random(); /* The rest is the same as for any other processor */ - init_espfix_ap(); + init_espfix_ap(0); } -void init_espfix_ap(void) +void init_espfix_ap(int cpu) { - unsigned int cpu, page; + unsigned int page; unsigned long addr; pud_t pud, *pud_p; pmd_t pmd, *pmd_p; @@ -149,7 +149,6 @@ void init_espfix_ap(void) if (likely(this_cpu_read(espfix_stack))) return; /* Already initialized */ - cpu = smp_processor_id(); addr = espfix_base_addr(cpu); page = cpu/ESPFIX_STACKS_PER_PAGE; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8add66b22f33..6f5abac6c28b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -242,7 +242,7 @@ static void notrace start_secondary(void *unused) * Enable the espfix hack for this CPU */ #ifdef CONFIG_X86_ESPFIX64 - init_espfix_ap(); + init_espfix_ap(smp_processor_id()); #endif /* -- cgit v1.2.1 From 20d5e4a9cd453991e2590a4c25230a99b42dee0c Mon Sep 17 00:00:00 2001 From: Zhu Guihua Date: Fri, 3 Jul 2015 17:37:19 +0800 Subject: x86/espfix: Init espfix on the boot CPU side As we alloc pages with GFP_KERNEL in init_espfix_ap() which is called before we enable local irqs, so the lockdep sub-system would (correctly) trigger a warning about the potentially blocking API. So we allocate them on the boot CPU side when the secondary CPU is brought up by the boot CPU, and hand them over to the secondary CPU. And we use alloc_pages_node() with the secondary CPU's node, to make sure the espfix stack is NUMA-local to the CPU that is going to use it. Signed-off-by: Zhu Guihua Cc: Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c97add2670e9abebb90095369f0cfc172373ac94.1435824469.git.zhugh.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/espfix_64.c | 21 +++++++++++++-------- arch/x86/kernel/smpboot.c | 14 +++++++------- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 1fb0e0003c94..ce95676abd60 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -141,12 +141,12 @@ void init_espfix_ap(int cpu) pud_t pud, *pud_p; pmd_t pmd, *pmd_p; pte_t pte, *pte_p; - int n; + int n, node; void *stack_page; pteval_t ptemask; /* We only have to do this once... */ - if (likely(this_cpu_read(espfix_stack))) + if (likely(per_cpu(espfix_stack, cpu))) return; /* Already initialized */ addr = espfix_base_addr(cpu); @@ -164,12 +164,15 @@ void init_espfix_ap(int cpu) if (stack_page) goto unlock_done; + node = cpu_to_node(cpu); ptemask = __supported_pte_mask; pud_p = &espfix_pud_page[pud_index(addr)]; pud = *pud_p; if (!pud_present(pud)) { - pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pmd_p = (pmd_t *)page_address(page); pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PUD_CLONES; n++) @@ -179,7 +182,9 @@ void init_espfix_ap(int cpu) pmd_p = pmd_offset(&pud, addr); pmd = *pmd_p; if (!pmd_present(pmd)) { - pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pte_p = (pte_t *)page_address(page); pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PMD_CLONES; n++) @@ -187,7 +192,7 @@ void init_espfix_ap(int cpu) } pte_p = pte_offset_kernel(&pmd, addr); - stack_page = (void *)__get_free_page(GFP_KERNEL); + stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); @@ -198,7 +203,7 @@ void init_espfix_ap(int cpu) unlock_done: mutex_unlock(&espfix_init_mutex); done: - this_cpu_write(espfix_stack, addr); - this_cpu_write(espfix_waddr, (unsigned long)stack_page - + (addr & ~PAGE_MASK)); + per_cpu(espfix_stack, cpu) = addr; + per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page + + (addr & ~PAGE_MASK); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6f5abac6c28b..0bd8c1d507b3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -238,13 +238,6 @@ static void notrace start_secondary(void *unused) */ check_tsc_sync_target(); - /* - * Enable the espfix hack for this CPU - */ -#ifdef CONFIG_X86_ESPFIX64 - init_espfix_ap(smp_processor_id()); -#endif - /* * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding @@ -854,6 +847,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) initial_code = (unsigned long)start_secondary; stack_start = idle->thread.sp; + /* + * Enable the espfix hack for this CPU + */ +#ifdef CONFIG_X86_ESPFIX64 + init_espfix_ap(cpu); +#endif + /* So we see what's up */ announce_cpu(cpu, apicid); -- cgit v1.2.1 From 827a82ff399523a954253dfea401af748640f0f4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Jul 2015 10:14:34 -0400 Subject: x86/earlyprintk: Allow early_printk() to use console style parameters like '115200n8' When I enable early_printk on a kernel, I cut and paste the console= input and add to earlyprintk parameter. But I notice recently that ktest has not been detecting triple faults. The way it detects it, is by seeing the kernel banner "Linux version .." with a different kernel version pop up. Then I noticed that early printk was no longer working on my console, which was why ktest was not seeing it. I bisected it down and it was added to 4.0 with this commit: ea9e9d802902 ("Specify PCI based UART for earlyprintk") because it converted the simple_strtoul() that converts the baud number into a kstrtoul(). The problem with this is, I had as my baud rate, 115200n8 (acceptable for console=ttyS0), but because of the "n8", the kstrtoul() doesn't parse the baud rate and returns an error, which sets the baud rate to the default 9600. This explains the garbage on my screen. Now, earlyprintk= kernel parameter does not say it accepts that format. Thus, one answer would simply be me changing my kernel parameters to remove the "n8" since it isn't parsed anyway. But I wonder if other people run into this, and it seems strange that the two consoles for serial accepts different input. I could also extend this to have earlyprintk do something with that "n8" or whatever it has and have it match the console parsing (which, BTW, still uses simple_strtoul(), as I guess it has to). This patch just makes my old kernel parameter parsing work like it use to. Although, simple_strtoull() is considered obsolete, it is the only standard string parsing function that parses a number that is attached to text. Ironically, commit ea9e9d802902 also added several calls to simple_strtoul()! Signed-off-by: Steven Rostedt Cc: Alan Cox Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Borislav Petkov Cc: Brian Gerst Cc: David Cohen Cc: Denys Vlasenko Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stuart R. Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150706101434.5f6a351b@gandalf.local.home [ Cleaned it up a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 89427d8d4fc5..eec40f595ab9 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -175,7 +175,9 @@ static __init void early_serial_init(char *s) } if (*s) { - if (kstrtoul(s, 0, &baud) < 0 || baud == 0) + baud = simple_strtoull(s, &e, 0); + + if (baud == 0 || s == e) baud = DEFAULT_BAUD; } -- cgit v1.2.1 From 5a3f75e3f02836518ce49536e9c460ca8e1fa290 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:32 +0000 Subject: x86/irq: Plug irq vector hotplug race Jin debugged a nasty cpu hotplug race which results in leaking a irq vector on the newly hotplugged cpu. cpu N cpu M native_cpu_up device_shutdown do_boot_cpu free_msi_irqs start_secondary arch_teardown_msi_irqs smp_callin default_teardown_msi_irqs setup_vector_irq arch_teardown_msi_irq __setup_vector_irq native_teardown_msi_irq lock(vector_lock) destroy_irq install vectors unlock(vector_lock) lock(vector_lock) ---> __clear_irq_vector unlock(vector_lock) lock(vector_lock) set_cpu_online unlock(vector_lock) This leaves the irq vector(s) which are torn down on CPU M stale in the vector array of CPU N, because CPU M does not see CPU N online yet. There is a similar issue with concurrent newly setup interrupts. The alloc/free protection of irq descriptors does not prevent the above race, because it merily prevents interrupt descriptors from going away or changing concurrently. Prevent this by moving the call to setup_vector_irq() into the vector_lock held region which protects set_cpu_online(): cpu N cpu M native_cpu_up device_shutdown do_boot_cpu free_msi_irqs start_secondary arch_teardown_msi_irqs smp_callin default_teardown_msi_irqs lock(vector_lock) arch_teardown_msi_irq setup_vector_irq() __setup_vector_irq native_teardown_msi_irq install vectors destroy_irq set_cpu_online unlock(vector_lock) lock(vector_lock) __clear_irq_vector unlock(vector_lock) So cpu M either sees the cpu N online before clearing the vector or cpu N installs the vectors after cpu M has cleared it. Reported-by: xiao jin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.141898931@linutronix.de --- arch/x86/kernel/apic/vector.c | 10 ++-------- arch/x86/kernel/smpboot.c | 13 +++++-------- 2 files changed, 7 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 28eba2d38b15..f813261d9740 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -409,12 +409,6 @@ static void __setup_vector_irq(int cpu) int irq, vector; struct apic_chip_data *data; - /* - * vector_lock will make sure that we don't run into irq vector - * assignments that might be happening on another cpu in parallel, - * while we setup our initial vector to irq mappings. - */ - raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { data = apic_chip_data(irq_get_irq_data(irq)); @@ -436,16 +430,16 @@ static void __setup_vector_irq(int cpu) if (!cpumask_test_cpu(cpu, data->domain)) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; } - raw_spin_unlock(&vector_lock); } /* - * Setup the vector to irq mappings. + * Setup the vector to irq mappings. Must be called with vector_lock held. */ void setup_vector_irq(int cpu) { int irq; + lockdep_assert_held(&vector_lock); /* * On most of the platforms, legacy PIC delivers the interrupts on the * boot cpu. But there are certain platforms where PIC interrupts are diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0bd8c1d507b3..d3010aa79daf 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -170,11 +170,6 @@ static void smp_callin(void) */ apic_ap_setup(); - /* - * Need to setup vector mappings before we enable interrupts. - */ - setup_vector_irq(smp_processor_id()); - /* * Save our processor parameters. Note: this information * is needed for clock calibration. @@ -239,11 +234,13 @@ static void notrace start_secondary(void *unused) check_tsc_sync_target(); /* - * We need to hold vector_lock so there the set of online cpus - * does not change while we are assigning vectors to cpus. Holding - * this lock ensures we don't half assign or remove an irq from a cpu. + * Lock vector_lock and initialize the vectors on this cpu + * before setting the cpu online. We must set it online with + * vector_lock held to prevent a concurrent setup/teardown + * from seeing a half valid vector space. */ lock_vector_lock(); + setup_vector_irq(smp_processor_id()); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); cpu_set_state_online(smp_processor_id()); -- cgit v1.2.1 From cbb24dc761d95fe39a7a122bb1b298e9604cae15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:33 +0000 Subject: x86/irq: Use proper locking in check_irq_vectors_for_cpu_disable() It's unsafe to examine fields in the irq descriptor w/o holding the descriptor lock. Add proper locking. While at it add a comment why the vector check can run lock less Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.236544164@linutronix.de --- arch/x86/kernel/irq.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 88b366487b0e..85ca76e6241c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -347,14 +347,22 @@ int check_irq_vectors_for_cpu_disable(void) if (!desc) continue; + /* + * Protect against concurrent action removal, + * affinity changes etc. + */ + raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); cpumask_copy(&affinity_new, data->affinity); cpumask_clear_cpu(this_cpu, &affinity_new); /* Do not count inactive or per-cpu irqs. */ - if (!irq_has_action(irq) || irqd_is_per_cpu(data)) + if (!irq_has_action(irq) || irqd_is_per_cpu(data)) { + raw_spin_unlock(&desc->lock); continue; + } + raw_spin_unlock(&desc->lock); /* * A single irq may be mapped to multiple * cpu's vector_irq[] (for example IOAPIC cluster @@ -385,6 +393,9 @@ int check_irq_vectors_for_cpu_disable(void) * vector. If the vector is marked in the used vectors * bitmap or an irq is assigned to it, we don't count * it as available. + * + * As this is an inaccurate snapshot anyway, we can do + * this w/o holding vector_lock. */ for (vector = FIRST_EXTERNAL_VECTOR; vector < first_system_vector; vector++) { -- cgit v1.2.1 From 09cf92b784fae6109450c5d64f9908066d605249 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:35 +0000 Subject: x86/irq: Retrieve irq data after locking irq_desc irq_data is protected by irq_desc->lock, so retrieving the irq chip from irq_data outside the lock is racy vs. an concurrent update. Move it into the lock held region. While at it add a comment why the vector walk does not require vector_lock. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.331320612@linutronix.de --- arch/x86/kernel/irq.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 85ca76e6241c..c7dfe1be784e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -497,6 +497,11 @@ void fixup_irqs(void) */ mdelay(1); + /* + * We can walk the vector array of this cpu without holding + * vector_lock because the cpu is already marked !online, so + * nothing else will touch it. + */ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irr; @@ -508,9 +513,9 @@ void fixup_irqs(void) irq = __this_cpu_read(vector_irq[vector]); desc = irq_to_desc(irq); + raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); chip = irq_data_get_irq_chip(data); - raw_spin_lock(&desc->lock); if (chip->irq_retrigger) { chip->irq_retrigger(data); __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); -- cgit v1.2.1 From 69711ca19b06d1b33d8f21213b540b5d1c638dbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Hinderer?= Date: Wed, 8 Jul 2015 00:02:01 +0200 Subject: x86/kconfig: Fix typo in the CONFIG_CMDLINE_BOOL help text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sébastien Hinderer Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Samuel Thibault Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0b2929f49531..3dbb7e7909ca 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2020,7 +2020,7 @@ config CMDLINE_BOOL To compile command line arguments into the kernel, set this option to 'Y', then fill in the - the boot arguments in CONFIG_CMDLINE. + boot arguments in CONFIG_CMDLINE. Systems with fully functional boot loaders (i.e. non-embedded) should leave this option set to 'N'. -- cgit v1.2.1