From 7e49fcce1bdadd723ae6a0b3b324c4daced61563 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 19:01:40 -0500 Subject: trace, lockdep: manual preempt count adding for local_bh_disable Impact: fix to preempt trace triggering lockdep check_flag failure In local_bh_disable, the use of add_preempt_count causes the preempt tracer to start recording the time preemption is off. But because it already modified the preempt_count to show softirqs disabled, and before it called the lockdep code to handle this, it causes a state that lockdep can not handle. The preempt tracer will reset the ring buffer on start of a trace, and the ring buffer reset code does a spin_lock_irqsave. This calls into lockdep and lockdep will fail when it detects the invalid state of having softirqs disabled but the internal current->softirqs_enabled is still set. The fix is to manually add the SOFTIRQ_OFFSET to preempt count and call the preempt tracer code outside the lockdep critical area. Thanks to Peter Zijlstra for suggesting this solution. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..33085b88f87b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern unsigned long get_parent_ip(unsigned long addr); + struct seq_file; struct cfs_rq; struct task_group; -- cgit v1.2.3 From 8aef2d2856158a36c295a8d1288281e4839bff13 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 01:10:15 -0400 Subject: function-graph: ignore times across schedule Impact: more accurate timings The current method of function graph tracing does not take into account the time spent when a task is not running. This shows functions that call schedule have increased costs: 3) + 18.664 us | } ------------------------------------------ 3) -0 => kblockd-123 ------------------------------------------ 3) | finish_task_switch() { 3) 1.441 us | _spin_unlock_irq(); 3) 3.966 us | } 3) ! 2959.433 us | } 3) ! 2961.465 us | } This patch uses the tracepoint in the scheduling context switch to account for time that has elapsed while a task is scheduled out. Now we see: ------------------------------------------ 3) -0 => edac-po-1067 ------------------------------------------ 3) | finish_task_switch() { 3) 0.685 us | _spin_unlock_irq(); 3) 2.331 us | } 3) + 41.439 us | } 3) + 42.663 us | } Signed-off-by: Steven Rostedt --- include/linux/sched.h | 2 ++ kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 89cd308cc7a5..471e36d30123 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1409,6 +1409,8 @@ struct task_struct { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack *ret_stack; + /* time stamp for last schedule */ + unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c81a759fbf76..0b90364d1a2c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,6 +29,8 @@ #include #include +#include + #include #include "trace.h" @@ -2590,6 +2592,31 @@ free: return ret; } +static void +ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, + struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. + */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + /* Allocate a return stack for each task */ static int start_graph_tracing(void) { @@ -2611,6 +2638,13 @@ static int start_graph_tracing(void) ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + kfree(ret_stack_list); return ret; } @@ -2674,6 +2708,7 @@ void unregister_ftrace_graph(void) mutex_lock(&ftrace_lock); atomic_dec(&ftrace_graph_active); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); @@ -2694,6 +2729,7 @@ void ftrace_graph_init_task(struct task_struct *t) t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; } else t->ret_stack = NULL; } -- cgit v1.2.3