diff options
Diffstat (limited to 'arch/x86_64')
-rw-r--r-- | arch/x86_64/Kconfig | 8 | ||||
-rw-r--r-- | arch/x86_64/Kconfig.debug | 4 | ||||
-rw-r--r-- | arch/x86_64/boot/setup.S | 2 | ||||
-rw-r--r-- | arch/x86_64/ia32/ia32entry.S | 19 | ||||
-rw-r--r-- | arch/x86_64/kernel/Makefile | 1 | ||||
-rw-r--r-- | arch/x86_64/kernel/entry.S | 188 | ||||
-rw-r--r-- | arch/x86_64/kernel/head64.c | 5 | ||||
-rw-r--r-- | arch/x86_64/kernel/irq.c | 4 | ||||
-rw-r--r-- | arch/x86_64/kernel/nmi.c | 2 | ||||
-rw-r--r-- | arch/x86_64/kernel/process.c | 2 | ||||
-rw-r--r-- | arch/x86_64/kernel/smpboot.c | 2 | ||||
-rw-r--r-- | arch/x86_64/kernel/stacktrace.c | 221 | ||||
-rw-r--r-- | arch/x86_64/kernel/traps.c | 129 | ||||
-rw-r--r-- | arch/x86_64/lib/thunk.S | 5 | ||||
-rw-r--r-- | arch/x86_64/mm/fault.c | 1 |
15 files changed, 496 insertions, 97 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index e856804c447f..28df7d88ce2c 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -24,6 +24,14 @@ config X86 bool default y +config LOCKDEP_SUPPORT + bool + default y + +config STACKTRACE_SUPPORT + bool + default y + config SEMAPHORE_SLEEPERS bool default y diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug index 1d92ab56c0f9..775d211a5cf9 100644 --- a/arch/x86_64/Kconfig.debug +++ b/arch/x86_64/Kconfig.debug @@ -1,5 +1,9 @@ menu "Kernel hacking" +config TRACE_IRQFLAGS_SUPPORT + bool + default y + source "lib/Kconfig.debug" config DEBUG_RODATA diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index 7de8b8fd1685..a50b631f4d2b 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -46,7 +46,7 @@ */ #include <asm/segment.h> -#include <linux/version.h> +#include <linux/utsrelease.h> #include <linux/compile.h> #include <asm/boot.h> #include <asm/e820.h> diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index c536fa98ea37..9b5bb413a6e9 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -13,6 +13,7 @@ #include <asm/thread_info.h> #include <asm/segment.h> #include <asm/vsyscall32.h> +#include <asm/irqflags.h> #include <linux/linkage.h> #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) @@ -75,6 +76,10 @@ ENTRY(ia32_sysenter_target) swapgs movq %gs:pda_kernelstack, %rsp addq $(PDA_STACKOFFSET),%rsp + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs, here we enable it straight after entry: + */ sti movl %ebp,%ebp /* zero extension */ pushq $__USER32_DS @@ -118,6 +123,7 @@ sysenter_do_call: movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli + TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) jnz int_ret_from_sys_call andl $~TS_COMPAT,threadinfo_status(%r10) @@ -132,6 +138,7 @@ sysenter_do_call: CFI_REGISTER rsp,rcx movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ CFI_REGISTER rip,rdx + TRACE_IRQS_ON swapgs sti /* sti only takes effect after the next instruction */ /* sysexit */ @@ -186,6 +193,10 @@ ENTRY(ia32_cstar_target) movl %esp,%r8d CFI_REGISTER rsp,r8 movq %gs:pda_kernelstack,%rsp + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ sti SAVE_ARGS 8,1,1 movl %eax,%eax /* zero extension */ @@ -220,6 +231,7 @@ cstar_do_call: movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli + TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) jnz int_ret_from_sys_call andl $~TS_COMPAT,threadinfo_status(%r10) @@ -228,6 +240,7 @@ cstar_do_call: CFI_REGISTER rip,rcx movl EFLAGS-ARGOFFSET(%rsp),%r11d /*CFI_REGISTER rflags,r11*/ + TRACE_IRQS_ON movl RSP-ARGOFFSET(%rsp),%esp CFI_RESTORE rsp swapgs @@ -286,7 +299,11 @@ ENTRY(ia32_syscall) /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ /*CFI_REL_OFFSET cs,CS-RIP*/ CFI_REL_OFFSET rip,RIP-RIP - swapgs + swapgs + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ sti movl %eax,%eax pushq %rax diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 819e84ec5b64..b5aaeafc1cd3 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -10,6 +10,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ pci-dma.o pci-nommu.o alternative.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index ed92c2983606..d464dded68c0 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -41,13 +41,24 @@ #include <asm/thread_info.h> #include <asm/hw_irq.h> #include <asm/page.h> +#include <asm/irqflags.h> .code64 #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif - + + +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + /* * C code is not supposed to know about undefined top of stack. Every time * a C function with an pt_regs argument is called from the SYSCALL based @@ -194,6 +205,10 @@ ENTRY(system_call) swapgs movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp + /* + * No need to follow this irqs off/on section - it's straight + * and short: + */ sti SAVE_ARGS 8,1 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) @@ -219,10 +234,15 @@ ret_from_sys_call: sysret_check: GET_THREAD_INFO(%rcx) cli + TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx CFI_REMEMBER_STATE jnz sysret_careful + /* + * sysretq will re-enable interrupts: + */ + TRACE_IRQS_ON movq RIP-ARGOFFSET(%rsp),%rcx CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 @@ -237,6 +257,7 @@ sysret_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc sysret_signal + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -247,6 +268,7 @@ sysret_careful: /* Handle a signal */ sysret_signal: + TRACE_IRQS_ON sti testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz 1f @@ -261,6 +283,7 @@ sysret_signal: /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ cli + TRACE_IRQS_OFF jmp int_with_check badsys: @@ -309,6 +332,7 @@ ENTRY(int_ret_from_sys_call) CFI_REL_OFFSET r10,R10-ARGOFFSET CFI_REL_OFFSET r11,R11-ARGOFFSET cli + TRACE_IRQS_OFF testl $3,CS-ARGOFFSET(%rsp) je retint_restore_args movl $_TIF_ALLWORK_MASK,%edi @@ -327,6 +351,7 @@ int_with_check: int_careful: bt $TIF_NEED_RESCHED,%edx jnc int_very_careful + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -334,10 +359,12 @@ int_careful: popq %rdi CFI_ADJUST_CFA_OFFSET -8 cli + TRACE_IRQS_OFF jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ int_very_careful: + TRACE_IRQS_ON sti SAVE_REST /* Check for syscall exit trace */ @@ -351,6 +378,7 @@ int_very_careful: CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi cli + TRACE_IRQS_OFF jmp int_restore_rest int_signal: @@ -363,6 +391,7 @@ int_signal: int_restore_rest: RESTORE_REST cli + TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC END(int_ret_from_sys_call) @@ -484,6 +513,10 @@ END(stub_rt_sigreturn) swapgs 1: incl %gs:pda_irqcount # RED-PEN should check preempt count cmoveq %gs:pda_irqstackptr,%rsp + /* + * We entered an interrupt context - irqs are off: + */ + TRACE_IRQS_OFF call \func .endm @@ -493,6 +526,7 @@ ENTRY(common_interrupt) /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: cli + TRACE_IRQS_OFF decl %gs:pda_irqcount leaveq CFI_DEF_CFA_REGISTER rsp @@ -515,9 +549,21 @@ retint_check: CFI_REMEMBER_STATE jnz retint_careful retint_swapgs: + /* + * The iretq could re-enable interrupts: + */ + cli + TRACE_IRQS_IRETQ swapgs + jmp restore_args + retint_restore_args: cli + /* + * The iretq could re-enable interrupts: + */ + TRACE_IRQS_IRETQ +restore_args: RESTORE_ARGS 0,8,0 iret_label: iretq @@ -530,6 +576,7 @@ iret_label: /* running with kernel gs */ bad_iret: movq $11,%rdi /* SIGSEGV */ + TRACE_IRQS_ON sti jmp do_exit .previous @@ -539,6 +586,7 @@ retint_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -547,11 +595,13 @@ retint_careful: CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) cli + TRACE_IRQS_OFF jmp retint_check retint_signal: testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz retint_swapgs + TRACE_IRQS_ON sti SAVE_REST movq $-1,ORIG_RAX(%rsp) @@ -560,6 +610,7 @@ retint_signal: call do_notify_resume RESTORE_REST cli + TRACE_IRQS_OFF movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) jmp retint_check @@ -666,7 +717,7 @@ END(spurious_interrupt) /* error code is on the stack already */ /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym, ist=0 + .macro paranoidentry sym, ist=0, irqtrace=1 SAVE_ALL cld movl $1,%ebx @@ -691,8 +742,73 @@ END(spurious_interrupt) addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif cli + .if \irqtrace + TRACE_IRQS_OFF + .endif .endm - + + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + .macro paranoidexit trace=1 + /* ebx: no swapgs flag */ +paranoid_exit\trace: + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore\trace + testl $3,CS(%rsp) + jnz paranoid_userspace\trace +paranoid_swapgs\trace: + TRACE_IRQS_IRETQ 0 + swapgs +paranoid_restore\trace: + RESTORE_ALL 8 + iretq +paranoid_userspace\trace: + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs\trace + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule\trace + movl %ebx,%edx /* arg3: thread flags */ + .if \trace + TRACE_IRQS_ON + .endif + sti + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace +paranoid_schedule\trace: + .if \trace + TRACE_IRQS_ON + .endif + sti + call schedule + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace + CFI_ENDPROC + .endm + /* * Exception entry point. This expects an error code/orig_rax on the stack * and the exception handler in %rax. @@ -748,6 +864,7 @@ error_exit: movl %ebx,%eax RESTORE_REST cli + TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax jne retint_kernel @@ -755,6 +872,10 @@ error_exit: movl $_TIF_WORK_MASK,%edi andl %edi,%edx jnz retint_careful + /* + * The iret might restore flags: + */ + TRACE_IRQS_IRETQ swapgs RESTORE_ARGS 0,8,0 jmp iret_label @@ -916,8 +1037,7 @@ KPROBE_ENTRY(debug) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK - jmp paranoid_exit - CFI_ENDPROC + paranoidexit END(debug) .previous .text @@ -926,49 +1046,13 @@ KPROBE_ENTRY(nmi) INTR_FRAME pushq $-1 CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. - * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - */ - /* ebx: no swapgs flag */ -paranoid_exit: - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore - testl $3,CS(%rsp) - jnz paranoid_userspace -paranoid_swapgs: - swapgs -paranoid_restore: - RESTORE_ALL 8 - iretq -paranoid_userspace: - GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule - movl %ebx,%edx /* arg3: thread flags */ - sti - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - cli - jmp paranoid_userspace -paranoid_schedule: - sti - call schedule - cli - jmp paranoid_userspace - CFI_ENDPROC + paranoidentry do_nmi, 0, 0 +#ifdef CONFIG_TRACE_IRQFLAGS + paranoidexit 0 +#else + jmp paranoid_exit1 + CFI_ENDPROC +#endif END(nmi) .previous .text @@ -977,7 +1061,7 @@ KPROBE_ENTRY(int3) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_int3, DEBUG_STACK - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(int3) .previous .text @@ -1006,7 +1090,7 @@ END(reserved) ENTRY(double_fault) XCPT_FRAME paranoidentry do_double_fault - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(double_fault) @@ -1022,7 +1106,7 @@ END(segment_not_present) ENTRY(stack_segment) XCPT_FRAME paranoidentry do_stack_segment - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(stack_segment) @@ -1050,7 +1134,7 @@ ENTRY(machine_check) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(machine_check) #endif diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index e6a71c9556d9..36647ce6aecb 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -85,6 +85,11 @@ void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); /* + * This must be called really, really early: + */ + lockdep_init(); + + /* * switch to init_level4_pgt from boot_level4_pgt */ memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t)); diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index a1f1df5f7bfc..5221a53e90c1 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -177,8 +177,10 @@ asmlinkage void do_softirq(void) local_irq_save(flags); pending = local_softirq_pending(); /* Switch to interrupt stack */ - if (pending) + if (pending) { call_softirq(); + WARN_ON_ONCE(softirq_count()); + } local_irq_restore(flags); } EXPORT_SYMBOL(do_softirq); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 476c1472fc07..5baa0c726e97 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -127,7 +127,7 @@ void __cpuinit nmi_watchdog_default(void) static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; - local_irq_enable(); + local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index ca56e19b8b6e..bb6745d13b8f 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -296,7 +296,7 @@ void __show_regs(struct pt_regs * regs) system_utsname.version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); printk_address(regs->rip); - printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->rax, regs->rbx, regs->rcx); diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 9705a6a384f1..b7c705969791 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -775,6 +775,8 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) }; DECLARE_WORK(work, do_fork_idle, &c_idle); + lockdep_set_class(&c_idle.done.wait.lock, &waitqueue_lock_key); + /* allocate memory for gdts of secondary cpus. Hotplug is considered */ if (!cpu_gdt_descr[cpu].address && !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c new file mode 100644 index 000000000000..32cf55eb9af8 --- /dev/null +++ b/arch/x86_64/kernel/stacktrace.c @@ -0,0 +1,221 @@ +/* + * arch/x86_64/kernel/stacktrace.c + * + * Stack trace management functions + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + */ +#include <linux/sched.h> +#include <linux/stacktrace.h> + +#include <asm/smp.h> + +static inline int +in_range(unsigned long start, unsigned long addr, unsigned long end) +{ + return addr >= start && addr <= end; +} + +static unsigned long +get_stack_end(struct task_struct *task, unsigned long stack) +{ + unsigned long stack_start, stack_end, flags; + int i, cpu; + + /* + * The most common case is that we are in the task stack: + */ + stack_start = (unsigned long)task->thread_info; + stack_end = stack_start + THREAD_SIZE; + + if (in_range(stack_start, stack, stack_end)) + return stack_end; + + /* + * We are in an interrupt if irqstackptr is set: + */ + raw_local_irq_save(flags); + cpu = safe_smp_processor_id(); + stack_end = (unsigned long)cpu_pda(cpu)->irqstackptr; + + if (stack_end) { + stack_start = stack_end & ~(IRQSTACKSIZE-1); + if (in_range(stack_start, stack, stack_end)) + goto out_restore; + /* + * We get here if we are in an IRQ context but we + * are also in an exception stack. + */ + } + + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ + for (i = 0; i < N_EXCEPTION_STACKS; i++) { + /* + * set 'end' to the end of the exception stack. + */ + stack_end = per_cpu(init_tss, cpu).ist[i]; + stack_start = stack_end - EXCEPTION_STKSZ; + + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ + if (stack >= stack_end) + continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ + if (stack >= stack_start) + goto out_restore; + + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ +#if DEBUG_STKSZ > EXCEPTION_STKSZ + if (i == DEBUG_STACK - 1 && stack >= stack_end - DEBUG_STKSZ) { + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Dont look: + */ + do { + stack_end -= EXCEPTION_STKSZ; + stack_start -= EXCEPTION_STKSZ; + } while (stack < stack_start); + + goto out_restore; + } +#endif + } + /* + * Ok, 'stack' is not pointing to any of the system stacks. + */ + stack_end = 0; + +out_restore: + raw_local_irq_restore(flags); + + return stack_end; +} + + +/* + * Save stack-backtrace addresses into a stack_trace buffer: + */ +static inline unsigned long +save_context_stack(struct stack_trace *trace, unsigned int skip, + unsigned long stack, unsigned long stack_end) +{ + unsigned long addr; + +#ifdef CONFIG_FRAME_POINTER + unsigned long prev_stack = 0; + + while (in_range(prev_stack, stack, stack_end)) { + pr_debug("stack: %p\n", (void *)stack); + addr = (unsigned long)(((unsigned long *)stack)[1]); + pr_debug("addr: %p\n", (void *)addr); + if (!skip) + trace->entries[trace->nr_entries++] = addr-1; + else + skip--; + if (trace->nr_entries >= trace->max_entries) + break; + if (!addr) + return 0; + /* + * Stack frames must go forwards (otherwise a loop could + * happen if the stackframe is corrupted), so we move + * prev_stack forwards: + */ + prev_stack = stack; + stack = (unsigned long)(((unsigned long *)stack)[0]); + } + pr_debug("invalid: %p\n", (void *)stack); +#else + while (stack < stack_end) { + addr = ((unsigned long *)stack)[0]; + stack += sizeof(long); + if (__kernel_text_address(addr)) { + if (!skip) + trace->entries[trace->nr_entries++] = addr-1; + else + skip--; + if (trace->nr_entries >= trace->max_entries) + break; + } + } +#endif + return stack; +} + +#define MAX_STACKS 10 + +/* + * Save stack-backtrace addresses into a stack_trace buffer. + * If all_contexts is set, all contexts (hardirq, softirq and process) + * are saved. If not set then only the current context is saved. + */ +void save_stack_trace(struct stack_trace *trace, + struct task_struct *task, int all_contexts, + unsigned int skip) +{ + unsigned long stack = (unsigned long)&stack; + int i, nr_stacks = 0, stacks_done[MAX_STACKS]; + + WARN_ON(trace->nr_entries || !trace->max_entries); + + if (!task) + task = current; + + pr_debug("task: %p, ti: %p\n", task, task->thread_info); + + if (!task || task == current) { + /* Grab rbp right from our regs: */ + asm ("mov %%rbp, %0" : "=r" (stack)); + pr_debug("rbp: %p\n", (void *)stack); + } else { + /* rbp is the last reg pushed by switch_to(): */ + stack = task->thread.rsp; + pr_debug("other task rsp: %p\n", (void *)stack); + stack = (unsigned long)(((unsigned long *)stack)[0]); + pr_debug("other task rbp: %p\n", (void *)stack); + } + + while (1) { + unsigned long stack_end = get_stack_end(task, stack); + + pr_debug("stack: %p\n", (void *)stack); + pr_debug("stack end: %p\n", (void *)stack_end); + + /* + * Invalid stack addres? + */ + if (!stack_end) + return; + /* + * Were we in this stack already? (recursion) + */ + for (i = 0; i < nr_stacks; i++) + if (stacks_done[i] == stack_end) + return; + stacks_done[nr_stacks] = stack_end; + + stack = save_context_stack(trace, skip, stack, stack_end); + if (!all_contexts || !stack || + trace->nr_entries >= trace->max_entries) + return; + trace->entries[trace->nr_entries++] = ULONG_MAX; + if (trace->nr_entries >= trace->max_entries) + return; + if (++nr_stacks >= MAX_STACKS) + return; + } +} + diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 5a5311d3de0f..79d05c482072 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -110,28 +110,31 @@ static int kstack_depth_to_print = 12; static int call_trace = 1; #ifdef CONFIG_KALLSYMS -#include <linux/kallsyms.h> -int printk_address(unsigned long address) -{ +# include <linux/kallsyms.h> +void printk_address(unsigned long address) +{ unsigned long offset = 0, symsize; const char *symname; char *modname; - char *delim = ":"; + char *delim = ":"; char namebuf[128]; - symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); - if (!symname) - return printk("[<%016lx>]", address); - if (!modname) + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%016lx>]\n", address); + return; + } + if (!modname) modname = delim = ""; - return printk("<%016lx>{%s%s%s%s%+ld}", - address, delim, modname, delim, symname, offset); -} + printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", + address, delim, modname, delim, symname, offset, symsize); +} #else -int printk_address(unsigned long address) -{ - return printk("[<%016lx>]", address); -} +void printk_address(unsigned long address) +{ + printk(" [<%016lx>]\n", address); +} #endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, @@ -149,10 +152,22 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, }; unsigned k; + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ for (k = 0; k < N_EXCEPTION_STACKS; k++) { unsigned long end; + /* + * set 'end' to the end of the exception stack. + */ switch (k + 1) { + /* + * TODO: this block is not needed i think, because + * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK] + * properly too. + */ #if DEBUG_STKSZ > EXCEPTION_STKSZ case DEBUG_STACK: end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ; @@ -162,19 +177,43 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, end = per_cpu(init_tss, cpu).ist[k]; break; } + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ if (stack >= end) continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ if (stack >= end - EXCEPTION_STKSZ) { + /* + * Make sure we only iterate through an exception + * stack once. If it comes up for the second time + * then there's something wrong going on - just + * break out and return NULL: + */ if (*usedp & (1U << k)) break; *usedp |= 1U << k; *idp = ids[k]; return (unsigned long *)end; } + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ #if DEBUG_STKSZ > EXCEPTION_STKSZ if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { unsigned j = N_EXCEPTION_STACKS - 1; + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Dont look: + */ do { ++j; end -= EXCEPTION_STKSZ; @@ -193,20 +232,14 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, static int show_trace_unwind(struct unwind_frame_info *info, void *context) { - int i = 11, n = 0; + int n = 0; while (unwind(info) == 0 && UNW_PC(info)) { - ++n; - if (i > 50) { - printk("\n "); - i = 7; - } else - i += printk(" "); - i += printk_address(UNW_PC(info)); + n++; + printk_address(UNW_PC(info)); if (arch_unw_user_mode(info)) break; } - printk("\n"); return n; } @@ -224,7 +257,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s int i = 11; unsigned used = 0; - printk("\nCall Trace:"); + printk("\nCall Trace:\n"); if (!tsk) tsk = current; @@ -250,16 +283,15 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s } } + /* + * Print function call entries within a stack. 'cond' is the + * "end of stackframe" condition, that the 'stack++' + * iteration will eventually trigger. + */ #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ if (kernel_text_address(addr)) { \ - if (i > 50) { \ - printk("\n "); \ - i = 0; \ - } \ - else \ - i += printk(" "); \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed \ @@ -268,20 +300,30 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s * down the cause of the crash will be able to figure \ * out the call path that was taken. \ */ \ - i += printk_address(addr); \ + printk_address(addr); \ } \ } while (0) - for(; ; ) { + /* + * Print function call entries in all stacks, starting at the + * current stack address. If the stacks consist of nested + * exceptions + */ + for ( ; ; ) { const char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, &used, &id); if (estack_end) { - i += printk(" <%s>", id); + printk(" <%s>", id); HANDLE_STACK (stack < estack_end); - i += printk(" <EOE>"); + printk(" <EOE>"); + /* + * We link to the next stack via the + * second-to-last pointer (index -2 to end) in the + * exception stack: + */ stack = (unsigned long *) estack_end[-2]; continue; } @@ -291,19 +333,28 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s (IRQSTACKSIZE - 64) / sizeof(*irqstack); if (stack >= irqstack && stack < irqstack_end) { - i += printk(" <IRQ>"); + printk(" <IRQ>"); HANDLE_STACK (stack < irqstack_end); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ stack = (unsigned long *) (irqstack_end[-1]); irqstack_end = NULL; - i += printk(" <EOI>"); + printk(" <EOI>"); continue; } } break; } + /* + * This prints the process stack: + */ HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK + printk("\n"); } @@ -337,8 +388,8 @@ static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned break; } if (i && ((i % 4) == 0)) - printk("\n "); - printk("%016lx ", *stack++); + printk("\n"); + printk(" %016lx", *stack++); touch_nmi_watchdog(); } show_trace(tsk, regs, rsp); diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S index e49af0032e94..332ea5dff916 100644 --- a/arch/x86_64/lib/thunk.S +++ b/arch/x86_64/lib/thunk.S @@ -47,6 +47,11 @@ thunk_retrax __down_failed_interruptible,__down_interruptible thunk_retrax __down_failed_trylock,__down_trylock thunk __up_wakeup,__up + +#ifdef CONFIG_TRACE_IRQFLAGS + thunk trace_hardirqs_on_thunk,trace_hardirqs_on + thunk trace_hardirqs_off_thunk,trace_hardirqs_off +#endif /* SAVE_ARGS below is used only for the .cfi directives it contains. */ CFI_STARTPROC diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 5afcf6eb00fa..ac8ea66ccb94 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -570,7 +570,6 @@ no_context: printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at %016lx RIP: \n" KERN_ALERT,address); printk_address(regs->rip); - printk("\n"); dump_pagetable(address); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; |