diff options
Diffstat (limited to 'kernel/trace')
-rw-r--r-- | kernel/trace/Kconfig | 22 | ||||
-rw-r--r-- | kernel/trace/blktrace.c | 20 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 144 | ||||
-rw-r--r-- | kernel/trace/power-traces.c | 5 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 368 | ||||
-rw-r--r-- | kernel/trace/trace.c | 40 | ||||
-rw-r--r-- | kernel/trace/trace.h | 4 | ||||
-rw-r--r-- | kernel/trace/trace_event_perf.c | 62 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 67 | ||||
-rw-r--r-- | kernel/trace/trace_export.c | 14 | ||||
-rw-r--r-- | kernel/trace/trace_functions_graph.c | 209 | ||||
-rw-r--r-- | kernel/trace/trace_irqsoff.c | 152 | ||||
-rw-r--r-- | kernel/trace/trace_kdb.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 46 | ||||
-rw-r--r-- | kernel/trace/trace_sched_wakeup.c | 256 | ||||
-rw-r--r-- | kernel/trace/trace_selftest.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_workqueue.c | 10 |
18 files changed, 967 insertions, 456 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 538501c6ea50..14674dce77a6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS help See Documentation/trace/ftrace-design.txt +config HAVE_C_RECORDMCOUNT + bool + help + C version of recordmcount available? + config TRACER_MAX_TRACE bool @@ -64,6 +69,21 @@ config EVENT_TRACING select CONTEXT_SWITCH_TRACER bool +config EVENT_POWER_TRACING_DEPRECATED + depends on EVENT_TRACING + bool "Deprecated power event trace API, to be removed" + default y + help + Provides old power event types: + C-state/idle accounting events: + power:power_start + power:power_end + and old cpufreq accounting event: + power:power_frequency + This is for userspace compatibility + and will vanish after 5 kernel iterations, + namely 2.6.41. + config CONTEXT_SWITCH_TRACER bool @@ -121,7 +141,7 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER + select FRAME_POINTER if !ARM_UNWIND && !S390 select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 959f8d6c8cc1..7b8ec0281548 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,7 +23,6 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/debugfs.h> -#include <linux/smp_lock.h> #include <linux/time.h> #include <linux/uaccess.h> @@ -169,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; -#define BLK_TC_HARDBARRIER BLK_TC_BARRIER #define BLK_TC_RAHEAD BLK_TC_AHEAD /* The ilog2() calls fall out because they're constant */ @@ -197,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, HARDBARRIER); what |= MASK_TC_BIT(rw, SYNC); what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); @@ -326,6 +323,7 @@ static const struct file_operations blk_dropped_fops = { .owner = THIS_MODULE, .open = blk_dropped_open, .read = blk_dropped_read, + .llseek = default_llseek, }; static int blk_msg_open(struct inode *inode, struct file *filp) @@ -365,6 +363,7 @@ static const struct file_operations blk_msg_fops = { .owner = THIS_MODULE, .open = blk_msg_open, .write = blk_msg_write, + .llseek = noop_llseek, }; /* @@ -639,7 +638,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - lock_kernel(); mutex_lock(&bdev->bd_mutex); switch (cmd) { @@ -667,7 +665,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) } mutex_unlock(&bdev->bd_mutex); - unlock_kernel(); return ret; } @@ -1652,10 +1649,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct block_device *bdev; ssize_t ret = -ENXIO; - lock_kernel(); bdev = bdget(part_devt(p)); if (bdev == NULL) - goto out_unlock_kernel; + goto out; q = blk_trace_get_queue(bdev); if (q == NULL) @@ -1683,8 +1679,7 @@ out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); -out_unlock_kernel: - unlock_kernel(); +out: return ret; } @@ -1714,11 +1709,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ret = -ENXIO; - lock_kernel(); p = dev_to_part(dev); bdev = bdget(part_devt(p)); if (bdev == NULL) - goto out_unlock_kernel; + goto out; q = blk_trace_get_queue(bdev); if (q == NULL) @@ -1753,8 +1747,6 @@ out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); -out_unlock_kernel: - unlock_kernel(); out: return ret ? ret : count; } @@ -1813,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_HARDBARRIER) - rwbs[i++] = 'B'; if (rw & REQ_SYNC) rwbs[i++] = 'S'; if (rw & REQ_META) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0d88ce9b9fb8..f3dadae83883 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; + int ret = 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - static DEFINE_MUTEX(mutex); static struct trace_seq s; unsigned long long avg; unsigned long long stddev; #endif + mutex_lock(&ftrace_profile_lock); + + /* we raced with function_profile_reset() */ + if (unlikely(rec->counter == 0)) { + ret = -EBUSY; + goto out; + } kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); @@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v) do_div(stddev, (rec->counter - 1) * 1000); } - mutex_lock(&mutex); trace_seq_init(&s); trace_print_graph_duration(rec->time, &s); trace_seq_puts(&s, " "); @@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v) trace_seq_puts(&s, " "); trace_print_graph_duration(stddev, &s); trace_print_seq(m, &s); - mutex_unlock(&mutex); #endif seq_putc(m, '\n'); +out: + mutex_unlock(&ftrace_profile_lock); - return 0; + return ret; } static void ftrace_profile_reset(struct ftrace_profile_stat *stat) @@ -793,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = { .open = tracing_open_generic, .read = ftrace_profile_read, .write = ftrace_profile_write, + .llseek = default_llseek, }; /* used to initialize the real stat files */ @@ -877,10 +885,8 @@ enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_ENABLE_MCOUNT = (1 << 3), - FTRACE_DISABLE_MCOUNT = (1 << 4), - FTRACE_START_FUNC_RET = (1 << 5), - FTRACE_STOP_FUNC_RET = (1 << 6), + FTRACE_START_FUNC_RET = (1 << 3), + FTRACE_STOP_FUNC_RET = (1 << 4), }; static int ftrace_filtered; @@ -1219,8 +1225,6 @@ static void ftrace_shutdown(int command) static void ftrace_startup_sysctl(void) { - int command = FTRACE_ENABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; @@ -1228,23 +1232,17 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - command |= FTRACE_ENABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_ENABLE_CALLS); } static void ftrace_shutdown_sysctl(void) { - int command = FTRACE_DISABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; /* ftrace_start_up is true if ftrace is running */ if (ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_DISABLE_CALLS); } static cycle_t ftrace_update_time; @@ -1361,24 +1359,29 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - struct ftrace_page *pg; - int hidx; - int idx; - unsigned flags; - struct trace_parser parser; + loff_t pos; + loff_t func_pos; + struct ftrace_page *pg; + struct dyn_ftrace *func; + struct ftrace_func_probe *probe; + struct trace_parser parser; + int hidx; + int idx; + unsigned flags; }; static void * -t_hash_next(struct seq_file *m, void *v, loff_t *pos) +t_hash_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct hlist_node *hnd = v; + struct hlist_node *hnd = NULL; struct hlist_head *hhd; - WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); - (*pos)++; + iter->pos = *pos; + if (iter->probe) + hnd = &iter->probe->node; retry: if (iter->hidx >= FTRACE_FUNC_HASHSIZE) return NULL; @@ -1401,7 +1404,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos) } } - return hnd; + if (WARN_ON_ONCE(!hnd)) + return NULL; + + iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + + return iter; } static void *t_hash_start(struct seq_file *m, loff_t *pos) @@ -1410,26 +1418,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_HASH)) - *pos = 0; - - iter->flags |= FTRACE_ITER_HASH; + if (iter->func_pos > *pos) + return NULL; iter->hidx = 0; - for (l = 0; l <= *pos; ) { - p = t_hash_next(m, p, &l); + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_hash_next(m, &l); if (!p) break; } - return p; + if (!p) + return NULL; + + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_HASH; + + return iter; } -static int t_hash_show(struct seq_file *m, void *v) +static int +t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) { struct ftrace_func_probe *rec; - struct hlist_node *hnd = v; - rec = hlist_entry(hnd, struct ftrace_func_probe, node); + rec = iter->probe; + if (WARN_ON_ONCE(!rec)) + return -EIO; if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); @@ -1450,12 +1464,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) struct dyn_ftrace *rec = NULL; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, v, pos); + return t_hash_next(m, pos); (*pos)++; + iter->pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) - return NULL; + return t_hash_start(m, pos); retry: if (iter->idx >= iter->pg->index) { @@ -1484,7 +1499,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } - return rec; + if (!rec) + return t_hash_start(m, pos); + + iter->func_pos = *pos; + iter->func = rec; + + return iter; +} + +static void reset_iter_read(struct ftrace_iterator *iter) +{ + iter->pos = 0; + iter->func_pos = 0; + iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -1495,6 +1523,12 @@ static void *t_start(struct seq_file *m, loff_t *pos) mutex_lock(&ftrace_lock); /* + * If an lseek was done, then reset and start from beginning. + */ + if (*pos < iter->pos) + reset_iter_read(iter); + + /* * For set_ftrace_filter reading, if we have the filter * off, we can short cut and just print out that all * functions are enabled. @@ -1503,12 +1537,19 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; + /* reset in case of seek/pread */ + iter->flags &= ~FTRACE_ITER_HASH; return iter; } if (iter->flags & FTRACE_ITER_HASH) return t_hash_start(m, pos); + /* + * Unfortunately, we need to restart at ftrace_pages_start + * every time we let go of the ftrace_mutex. This is because + * those pointers can change without the lock. + */ iter->pg = ftrace_pages_start; iter->idx = 0; for (l = 0; l <= *pos; ) { @@ -1517,10 +1558,14 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p && iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); + if (!p) { + if (iter->flags & FTRACE_ITER_FILTER) + return t_hash_start(m, pos); - return p; + return NULL; + } + + return iter; } static void t_stop(struct seq_file *m, void *p) @@ -1531,16 +1576,18 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; - struct dyn_ftrace *rec = v; + struct dyn_ftrace *rec; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, v); + return t_hash_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { seq_printf(m, "#### all functions enabled ####\n"); return 0; } + rec = iter->func; + if (!rec) return 0; @@ -1592,8 +1639,8 @@ ftrace_failures_open(struct inode *inode, struct file *file) ret = ftrace_avail_open(inode, file); if (!ret) { - m = (struct seq_file *)file->private_data; - iter = (struct ftrace_iterator *)m->private; + m = file->private_data; + iter = m->private; iter->flags = FTRACE_ITER_FAILURES; } @@ -2623,6 +2670,7 @@ static const struct file_operations ftrace_graph_fops = { .read = seq_read, .write = ftrace_graph_write, .release = ftrace_graph_release, + .llseek = seq_lseek, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index a22582a06161..f55fcf61b223 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,5 +13,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/power.h> -EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); +#ifdef EVENT_POWER_TRACING_DEPRECATED +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); +#endif +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 19cccc3c3028..bd1c35a4fbcc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -224,6 +224,9 @@ enum { RB_LEN_TIME_STAMP = 16, }; +#define skip_time_extend(event) \ + ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) + static inline int rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; @@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event) return length + RB_EVNT_HDR_SIZE; } -/* inline for ring buffer fast paths */ -static unsigned +/* + * Return the length of the given event. Will return + * the length of the time extend if the event is a + * time extend. + */ +static inline unsigned rb_event_length(struct ring_buffer_event *event) { switch (event->type_len) { @@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event) return 0; } +/* + * Return total length of time extend and data, + * or just the event length for all other events. + */ +static inline unsigned +rb_event_ts_length(struct ring_buffer_event *event) +{ + unsigned len = 0; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + /* time extends include the data event after it */ + len = RB_LEN_TIME_EXTEND; + event = skip_time_extend(event); + } + return len + rb_event_length(event); +} + /** * ring_buffer_event_length - return the length of the event * @event: the event to get the length of + * + * Returns the size of the data load of a data event. + * If the event is something other than a data event, it + * returns the size of the event itself. With the exception + * of a TIME EXTEND, where it still returns the size of the + * data load of the data event after it. */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - unsigned length = rb_event_length(event); + unsigned length; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + length = rb_event_length(event); if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; @@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ if (event->type_len) @@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) -/* Max number of timestamps that can fit on a page */ -#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) - int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; @@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } + + return skip_time_extend(event); +} + /** * ring_buffer_update_event - update event type and data * @event: the even to update @@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * data field. */ static void -rb_update_event(struct ring_buffer_event *event, - unsigned type, unsigned length) +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, unsigned length, + int add_timestamp, u64 delta) { - event->type_len = type; - - switch (type) { - - case RINGBUF_TYPE_PADDING: - case RINGBUF_TYPE_TIME_EXTEND: - case RINGBUF_TYPE_TIME_STAMP: - break; + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; - case 0: - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - event->array[0] = length; - else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); - break; - default: - BUG(); + /* + * If we need to add a timestamp, then we + * add it to the start of the resevered space. + */ + if (unlikely(add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; + delta = 0; } + + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); } /* @@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } -static struct ring_buffer_event * +/* + * This is the slow path, force gcc not to inline it. + */ +static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 *ts) + struct buffer_page *tail_page, u64 ts) { struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; @@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer); - next_page->page->time_stamp = *ts; + ts = rb_time_stamp(buffer); + next_page->page->time_stamp = ts; } out_again: @@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) + unsigned long length, u64 ts, + u64 delta, int add_timestamp) { struct buffer_page *tail_page; struct ring_buffer_event *event; unsigned long tail, write; + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(add_timestamp)) + length += RB_LEN_TIME_EXTEND; + tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); @@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - length; /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) + if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, length, tail, tail_page, ts); @@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(event, type, length); + rb_update_event(cpu_buffer, event, length, add_timestamp, delta); - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); + local_inc(&tail_page->entries); /* * If this is the first commit on the page, then update * its timestamp. */ if (!tail) - tail_page->page->time_stamp = *ts; + tail_page->page->time_stamp = ts; return event; } @@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long addr; new_index = rb_event_index(event); - old_index = new_index + rb_event_length(event); + old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; addr &= PAGE_MASK; @@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static int -rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 *ts, u64 *delta) -{ - struct ring_buffer_event *event; - int ret; - - WARN_ONCE(*delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", - (unsigned long long)*delta, - (unsigned long long)*ts, - (unsigned long long)cpu_buffer->write_stamp); - - /* - * The delta is too big, we to add a - * new timestamp. - */ - event = __rb_reserve_next(cpu_buffer, - RINGBUF_TYPE_TIME_EXTEND, - RB_LEN_TIME_EXTEND, - ts); - if (!event) - return -EBUSY; - - if (PTR_ERR(event) == -EAGAIN) - return -EAGAIN; - - /* Only a commited time event can update the write stamp */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * If this is the first on the page, then it was - * updated with the page itself. Try to discard it - * and if we can't just make it zero. - */ - if (rb_event_index(event)) { - event->time_delta = *delta & TS_MASK; - event->array[0] = *delta >> TS_SHIFT; - } else { - /* try to discard, since we do not need this */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - } - cpu_buffer->write_stamp = *ts; - /* let the caller know this was the commit */ - ret = 1; - } else { - /* Try to discard the event */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; - } - ret = 0; - } - - *delta = 0; - - return ret; -} - static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->committing); local_inc(&cpu_buffer->commits); } -static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta = 0; - int commit = 0; + u64 ts, delta; int nr_loops = 0; + int add_timestamp; + u64 diff; rb_start_commit(cpu_buffer); @@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, length = rb_calculate_event_length(length); again: + add_timestamp = 0; + delta = 0; + /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop @@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer, goto out_fail; ts = rb_time_stamp(cpu_buffer->buffer); + diff = ts - cpu_buffer->write_stamp; - /* - * Only the first commit can update the timestamp. - * Yes there is a race here. If an interrupt comes in - * just after the conditional and it traces too, then it - * will also check the deltas. More than one timestamp may - * also be made. But only the entry that did the actual - * commit will be something other than zero. - */ - if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer))) { - u64 diff; - - diff = ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - /* Did the write stamp get updated already? */ - if (unlikely(ts < cpu_buffer->write_stamp)) - goto get_event; + /* make sure this diff is calculated here */ + barrier(); + /* Did the write stamp get updated already? */ + if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { - - commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); - if (commit == -EBUSY) - goto out_fail; - - if (commit == -EAGAIN) - goto again; - - RB_WARN_ON(cpu_buffer, commit < 0); + WARN_ONCE(delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + (unsigned long long)delta, + (unsigned long long)ts, + (unsigned long long)cpu_buffer->write_stamp); + add_timestamp = 1; } } - get_event: - event = __rb_reserve_next(cpu_buffer, 0, length, &ts); + event = __rb_reserve_next(cpu_buffer, length, ts, + delta, add_timestamp); if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; if (!event) goto out_fail; - if (!rb_event_is_commit(cpu_buffer, event)) - delta = 0; - - event->time_delta = delta; - return event; out_fail: @@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, #define TRACE_RECURSIVE_DEPTH 16 -static int trace_recursive_lock(void) +/* Keep this code out of the fast path cache */ +static noinline void trace_recursive_fail(void) { - current->trace_recursion++; - - if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) - return 0; - /* Disable all tracing before we do anything else */ tracing_off_permanent(); @@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void) in_nmi()); WARN_ON_ONCE(1); +} + +static inline int trace_recursive_lock(void) +{ + current->trace_recursion++; + + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; + + trace_recursive_fail(); + return -1; } -static void trace_recursive_unlock(void) +static inline void trace_recursive_unlock(void) { WARN_ON_ONCE(!current->trace_recursion); @@ -2308,12 +2298,28 @@ static void rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { + u64 delta; + /* * The event first in the commit queue updates the * time stamp. */ - if (rb_event_is_commit(cpu_buffer, event)) - cpu_buffer->write_stamp += event->time_delta; + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } } static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, @@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static inline void rb_event_discard(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + /* array[0] holds the actual length for the discarded event */ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; @@ -2606,6 +2615,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. + */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) +{ + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} + /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer @@ -2614,16 +2636,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) - - cpu_buffer->read; - return ret; + return rb_num_of_entries(cpu_buffer); } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); @@ -2684,8 +2703,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += (local_read(&cpu_buffer->entries) - - local_read(&cpu_buffer->overrun)) - cpu_buffer->read; + entries += rb_num_of_entries(cpu_buffer); } return entries; @@ -2985,13 +3003,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) static void rb_advance_iter(struct ring_buffer_iter *iter) { - struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; unsigned length; cpu_buffer = iter->cpu_buffer; - buffer = cpu_buffer->buffer; /* * Check if we are at the end of the buffer. @@ -3042,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written, or from discarded - * commits. The most that we can have is the number on a single page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -3123,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a timestamp is encountered. - * We can get multiple timestamps by nested interrupts or also - * if filtering is on (discarding commits). Since discarding - * commits can be frequent we can get a lot of timestamps. - * But we limit them by not adding timestamps if they begin - * at the start of a page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) @@ -3828,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (len > (commit - read)) len = (commit - read); - size = rb_event_length(event); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); if (len < size) goto out_unlock; @@ -3838,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* Need to copy one event at a time */ do { + /* We need the size of one event, because + * rb_advance_reader only advances by one event, + * whereas rb_event_ts_length may include the size of + * one or two events. + * We have already ensured there's enough space if this + * is a time extend. */ + size = rb_event_length(event); memcpy(bpage->data + pos, rpage->data + rpos, size); len -= size; @@ -3850,8 +3872,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, break; event = rb_reader_event(cpu_buffer); - size = rb_event_length(event); - } while (len > size); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); + } while (len >= size); /* update bpage */ local_set(&bpage->commit, pos); @@ -3967,6 +3990,7 @@ static const struct file_operations rb_simple_fops = { .open = tracing_open_generic, .read = rb_simple_read, .write = rb_simple_write, + .llseek = default_llseek, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ec59f541156..f8cf959bad45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -17,7 +17,6 @@ #include <linux/writeback.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> -#include <linux/smp_lock.h> #include <linux/notifier.h> #include <linux/irqflags.h> #include <linux/debugfs.h> @@ -1284,6 +1283,8 @@ void trace_dump_stack(void) __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); } +static DEFINE_PER_CPU(int, user_stack_count); + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { @@ -1302,6 +1303,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (unlikely(in_nmi())) return; + /* + * prevent recursion, since the user stack tracing may + * trigger other kernel events. + */ + preempt_disable(); + if (__this_cpu_read(user_stack_count)) + goto out; + + __this_cpu_inc(user_stack_count); + + + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) @@ -1319,6 +1332,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) save_stack_trace_user(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + + __this_cpu_dec(user_stack_count); + + out: + preempt_enable(); } #ifdef UNUSED @@ -2196,7 +2214,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; @@ -2320,11 +2338,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, return count; } +static loff_t tracing_seek(struct file *file, loff_t offset, int origin) +{ + if (file->f_mode & FMODE_READ) + return seq_lseek(file, offset, origin); + else + return 0; +} + static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .write = tracing_write_stub, - .llseek = seq_lseek, + .llseek = tracing_seek, .release = tracing_release, }; @@ -3996,13 +4022,9 @@ static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); struct dentry *d_cpu; - /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ - char cpu_dir[7]; - - if (cpu > 999 || cpu < 0) - return; + char cpu_dir[30]; /* 30 characters should be more than enough */ - sprintf(cpu_dir, "cpu%ld", cpu); + snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = debugfs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d39b3c5454a5..9021f8c0c0c3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 000e6e85b445..19a359d5e6d5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,7 +9,7 @@ #include <linux/kprobes.h> #include "trace.h" -static char *perf_trace_buf[4]; +static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses @@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) /* Count the events in use (per event id, not per instance) */ static int total_ref_count; +static int perf_trace_event_perm(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + /* No tracing, just counting, so no obvious leak */ + if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) + return 0; + + /* Some events are ok to be traced by non-root users... */ + if (p_event->attach_state == PERF_ATTACH_TASK) { + if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) + return 0; + } + + /* + * ...otherwise raw tracepoint data can be a severe data leak, + * only allow root to have these. + */ + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return 0; +} + static int perf_trace_event_init(struct ftrace_event_call *tp_event, struct perf_event *p_event) { - struct hlist_head *list; - int ret = -ENOMEM; + struct hlist_head __percpu *list; + int ret; int cpu; + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + p_event->tp_event = tp_event; if (tp_event->perf_refcount++ > 0) return 0; + ret = -ENOMEM; + list = alloc_percpu(struct hlist_head); if (!list) goto fail; @@ -42,11 +71,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, tp_event->perf_events = list; if (!total_ref_count) { - char *buf; + char __percpu *buf; int i; - for (i = 0; i < 4; i++) { - buf = (char *)alloc_percpu(perf_trace_t); + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + buf = (char __percpu *)alloc_percpu(perf_trace_t); if (!buf) goto fail; @@ -65,7 +94,7 @@ fail: if (!total_ref_count) { int i; - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } @@ -91,6 +120,8 @@ int perf_trace_init(struct perf_event *p_event) tp_event->class && tp_event->class->reg && try_module_get(tp_event->mod)) { ret = perf_trace_event_init(tp_event, p_event); + if (ret) + module_put(tp_event->mod); break; } } @@ -99,22 +130,26 @@ int perf_trace_init(struct perf_event *p_event) return ret; } -int perf_trace_enable(struct perf_event *p_event) +int perf_trace_add(struct perf_event *p_event, int flags) { struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head __percpu *pcpu_list; struct hlist_head *list; - list = tp_event->perf_events; - if (WARN_ON_ONCE(!list)) + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) return -EINVAL; - list = this_cpu_ptr(list); + if (!(flags & PERF_EF_START)) + p_event->hw.state = PERF_HES_STOPPED; + + list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); return 0; } -void perf_trace_disable(struct perf_event *p_event) +void perf_trace_del(struct perf_event *p_event, int flags) { hlist_del_rcu(&p_event->hlist_entry); } @@ -140,12 +175,13 @@ void perf_trace_destroy(struct perf_event *p_event) tp_event->perf_events = NULL; if (!--total_ref_count) { - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } } out: + module_put(tp_event->mod); mutex_unlock(&event_mutex); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4c758f146328..35fde09b81de 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,6 +27,12 @@ DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_storage_mutex); +EXPORT_SYMBOL_GPL(event_storage_mutex); + +char event_storage[EVENT_STORAGE_SIZE]; +EXPORT_SYMBOL_GPL(event_storage); + LIST_HEAD(ftrace_events); LIST_HEAD(ftrace_common_fields); @@ -600,21 +606,29 @@ out: enum { FORMAT_HEADER = 1, - FORMAT_PRINTFMT = 2, + FORMAT_FIELD_SEPERATOR = 2, + FORMAT_PRINTFMT = 3, }; static void *f_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_event_call *call = m->private; struct ftrace_event_field *field; - struct list_head *head; + struct list_head *common_head = &ftrace_common_fields; + struct list_head *head = trace_get_fields(call); (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: - head = &ftrace_common_fields; + if (unlikely(list_empty(common_head))) + return NULL; + field = list_entry(common_head->prev, + struct ftrace_event_field, link); + return field; + + case FORMAT_FIELD_SEPERATOR: if (unlikely(list_empty(head))) return NULL; @@ -626,31 +640,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) return NULL; } - head = trace_get_fields(call); - - /* - * To separate common fields from event fields, the - * LSB is set on the first event field. Clear it in case. - */ - v = (void *)((unsigned long)v & ~1L); - field = v; - /* - * If this is a common field, and at the end of the list, then - * continue with main list. - */ - if (field->link.prev == &ftrace_common_fields) { - if (unlikely(list_empty(head))) - return NULL; - field = list_entry(head->prev, struct ftrace_event_field, link); - /* Set the LSB to notify f_show to print an extra newline */ - field = (struct ftrace_event_field *) - ((unsigned long)field | 1); - return field; - } - - /* If we are done tell f_show to print the format */ - if (field->link.prev == head) + if (field->link.prev == common_head) + return (void *)FORMAT_FIELD_SEPERATOR; + else if (field->link.prev == head) return (void *)FORMAT_PRINTFMT; field = list_entry(field->link.prev, struct ftrace_event_field, link); @@ -688,22 +681,16 @@ static int f_show(struct seq_file *m, void *v) seq_printf(m, "format:\n"); return 0; + case FORMAT_FIELD_SEPERATOR: + seq_putc(m, '\n'); + return 0; + case FORMAT_PRINTFMT: seq_printf(m, "\nprint fmt: %s\n", call->print_fmt); return 0; } - /* - * To separate common fields from event fields, the - * LSB is set on the first event field. Clear it and - * print a newline if it is set. - */ - if ((unsigned long)v & 1) { - seq_putc(m, '\n'); - v = (void *)((unsigned long)v & ~1L); - } - field = v; /* @@ -951,6 +938,7 @@ static const struct file_operations ftrace_enable_fops = { .open = tracing_open_generic, .read = event_enable_read, .write = event_enable_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_event_format_fops = { @@ -963,29 +951,34 @@ static const struct file_operations ftrace_event_format_fops = { static const struct file_operations ftrace_event_id_fops = { .open = tracing_open_generic, .read = event_id_read, + .llseek = default_llseek, }; static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_generic, .read = event_filter_read, .write = event_filter_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_subsystem_filter_fops = { .open = tracing_open_generic, .read = subsystem_filter_read, .write = subsystem_filter_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_system_enable_fops = { .open = tracing_open_generic, .read = system_enable_read, .write = system_enable_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, + .llseek = default_llseek, }; static struct dentry *event_trace_events_dir(void) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4ba44deaac25..4b74d71705c0 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __array #define __array(type, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + do { \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + mutex_lock(&event_storage_mutex); \ + snprintf(event_storage, sizeof(event_storage), \ + "%s[%d]", #type, len); \ + ret = trace_define_field(event_call, event_storage, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), FILTER_OTHER); \ - if (ret) \ - return ret; + mutex_unlock(&event_storage_mutex); \ + if (ret) \ + return ret; \ + } while (0); #undef __array_desc #define __array_desc(type, container, item, len) \ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6f233698518e..76b05980225c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -15,15 +15,19 @@ #include "trace.h" #include "trace_output.h" +/* When set, irq functions will be ignored */ +static int ftrace_graph_skip_irqs; + struct fgraph_cpu_data { pid_t last_pid; int depth; + int depth_irq; int ignore; unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; struct fgraph_data { - struct fgraph_cpu_data *cpu_data; + struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. */ struct ftrace_graph_ent_entry ent; @@ -41,6 +45,7 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, /* Display absolute time of an entry */ { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, + /* Display interrupts */ + { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns and proc by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | - TRACE_GRAPH_PRINT_DURATION, + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, .opts = trace_opts }; @@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr, return 1; } +static inline int ftrace_graph_ignore_irqs(void) +{ + if (!ftrace_graph_skip_irqs) + return 0; + + return in_irq(); +} + int trace_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = graph_array; @@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* trace it when it is-nested-in or is a function enabled. */ - if (!(trace->depth || ftrace_graph_addr(trace->func))) + if (!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) return 0; local_irq_save(flags); @@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) return trace_graph_entry(trace); } +static void +__trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + __trace_graph_function(tr, ip, flags, pc); +} + void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, @@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { - snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", - nsecs_rem); + size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); + + snprintf(nsecs_str, slen, "%03lu", nsecs_rem); ret = trace_seq_printf(s, ".%s", nsecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return 0; } +/* + * Entry check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just extered irq code + * + * retunns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_entry(struct trace_iterator *iter, u32 flags, + unsigned long addr, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are inside the irq code + */ + if (*depth_irq >= 0) + return 1; + + if ((addr < (unsigned long)__irqentry_text_start) || + (addr >= (unsigned long)__irqentry_text_end)) + return 0; + + /* + * We are entering irq code. + */ + *depth_irq = depth; + return 1; +} + +/* + * Return check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just left irq code + * + * returns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_return(struct trace_iterator *iter, u32 flags, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are not inside the irq code. + */ + if (*depth_irq == -1) + return 0; + + /* + * We are inside the irq code, and this is returning entry. + * Let's not trace it and clear the entry depth, since + * we are out of irq code. + * + * This condition ensures that we 'leave the irq code' once + * we are out of the entry depth. Thus protecting us from + * the RETURN entry loss. + */ + if (*depth_irq >= depth) { + *depth_irq = -1; + return 1; + } + + /* + * We are inside the irq code, and this is not the entry. + */ + return 1; +} + static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter, u32 flags) @@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t ret; int cpu = iter->cpu; + if (check_irq_entry(iter, flags, call->func, call->depth)) + return TRACE_TYPE_HANDLED; + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) return TRACE_TYPE_PARTIAL_LINE; @@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int ret; int i; + if (check_irq_return(iter, flags, trace->depth)) + return TRACE_TYPE_HANDLED; + if (data) { struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; @@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -print_graph_function_flags(struct trace_iterator *iter, u32 flags) +__print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return print_graph_function_flags(iter, tracer_flags.val); + return __print_graph_function_flags(iter, tracer_flags.val); +} + +enum print_line_t print_graph_function_flags(struct trace_iterator *iter, + u32 flags) +{ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + return __print_graph_function_flags(iter, flags); } static enum print_line_t @@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s|||| / \n", size, spaces); } -void print_graph_headers_flags(struct seq_file *s, u32 flags) +static void __print_graph_headers_flags(struct seq_file *s, u32 flags) { int lat = trace_flags & TRACE_ITER_LATENCY_FMT; @@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s) print_graph_headers_flags(s, tracer_flags.val); } +void print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + struct trace_iterator *iter = s->private; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + __print_graph_headers_flags(s, flags); +} + void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ @@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter) pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + *pid = -1; *depth = 0; *ignore = 0; + *depth_irq = -1; } iter->private = data; @@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter) } } +static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_GRAPH_PRINT_IRQS) + ftrace_graph_skip_irqs = !set; + + return 0; +} + static struct trace_event_functions graph_functions = { .trace = print_graph_function_event, }; @@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = { .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, + .set_flag = func_graph_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function_graph, #endif diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 73a6b0601f2e..5cf8c602b880 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; #ifdef CONFIG_FUNCTION_TRACER /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the preempt and irqs off function tracers. + * + * Returns 1 if it is OK to continue, and data->disabled is + * incremented. + * 0 if the trace is to be ignored, and data->disabled + * is kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +static int func_prolog_dec(struct trace_array *tr, + struct trace_array_cpu **data, + unsigned long *flags) { - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; @@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) */ cpu = raw_smp_processor_id(); if (likely(!per_cpu(tracing_cpu, cpu))) - return; + return 0; - local_save_flags(flags); + local_save_flags(*flags); /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; + if (!irqs_disabled_flags(*flags)) + return 0; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, preempt_count()); + return 1; + + atomic_dec(&(*data)->disabled); + + return 0; +} + +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + trace_function(tr, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; int ret; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) + if (!func_prolog_dec(tr, &data, &flags)) return 0; - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return 0; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else - ret = 0; - + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); atomic_dec(&data->disabled); + return ret; } @@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) + if (!func_prolog_dec(tr, &data, &flags)) return; - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); atomic_dec(&data->disabled); } @@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) - flags |= TRACE_GRAPH_PRINT_DURATION; - else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - /* * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ if (is_graph()) - return print_graph_function_flags(iter, flags); + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; } static void irqsoff_print_header(struct seq_file *s) { - if (is_graph()) { - struct trace_iterator *iter = s->private; - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return; - - print_trace_header(s, iter); - flags |= TRACE_GRAPH_PRINT_DURATION; - } else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - - print_graph_headers_flags(s, flags); - } else + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else trace_default_header(s); } static void -trace_graph_function(struct trace_array *tr, - unsigned long ip, unsigned long flags, int pc) -{ - u64 time = trace_clock_local(); - struct ftrace_graph_ent ent = { - .func = ip, - .depth = 0, - }; - struct ftrace_graph_ret ret = { - .func = ip, - .depth = 0, - .calltime = time, - .rettime = time, - }; - - __trace_graph_entry(tr, &ent, flags, pc); - __trace_graph_return(tr, &ret, flags, pc); -} - -static void __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (!is_graph()) + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else trace_function(tr, ip, parent_ip, flags, pc); - else { - trace_graph_function(tr, parent_ip, flags, pc); - trace_graph_function(tr, ip, flags, pc); - } } #else diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 7b8ecd751d93..3c5c5dfea0b3 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -13,7 +13,6 @@ #include <linux/kdb.h> #include <linux/ftrace.h> -#include "../debug/kdb/kdb_private.h" #include "trace.h" #include "trace_output.h" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8b27c9849b42..2dec9bcde8b4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -31,7 +31,6 @@ #include <linux/perf_event.h> #include <linux/stringify.h> #include <linux/limits.h> -#include <linux/uaccess.h> #include <asm/bitsperlong.h> #include "trace.h" @@ -514,8 +513,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); -/* Check the name is good for event/group */ -static int check_event_name(const char *name) +/* Check the name is good for event/group/fields */ +static int is_good_name(const char *name) { if (!isalpha(*name) && *name != '_') return 0; @@ -557,7 +556,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, else tp->rp.kp.pre_handler = kprobe_dispatcher; - if (!event || !check_event_name(event)) { + if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; } @@ -567,7 +566,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, if (!tp->call.name) goto error; - if (!group || !check_event_name(group)) { + if (!group || !is_good_name(group)) { ret = -EINVAL; goto error; } @@ -648,7 +647,7 @@ static int register_trace_probe(struct trace_probe *tp) } ret = register_probe_event(tp); if (ret) { - pr_warning("Faild to register probe event(%d)\n", ret); + pr_warning("Failed to register probe event(%d)\n", ret); goto end; } @@ -883,7 +882,7 @@ static int create_trace_probe(int argc, char **argv) int i, ret = 0; int is_return = 0, is_delete = 0; char *symbol = NULL, *event = NULL, *group = NULL; - char *arg, *tmp; + char *arg; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -992,26 +991,36 @@ static int create_trace_probe(int argc, char **argv) /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + /* Increment count for freeing args in error case */ + tp->nr_args++; + /* Parse argument name */ arg = strchr(argv[i], '='); - if (arg) + if (arg) { *arg++ = '\0'; - else + tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + } else { arg = argv[i]; + /* If argument name is omitted, set "argN" */ + snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); + tp->args[i].name = kstrdup(buf, GFP_KERNEL); + } - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); if (!tp->args[i].name) { - pr_info("Failed to allocate argument%d name '%s'.\n", - i, argv[i]); + pr_info("Failed to allocate argument[%d] name.\n", i); ret = -ENOMEM; goto error; } - tmp = strchr(tp->args[i].name, ':'); - if (tmp) - *tmp = '_'; /* convert : to _ */ + + if (!is_good_name(tp->args[i].name)) { + pr_info("Invalid argument[%d] name: %s\n", + i, tp->args[i].name); + ret = -EINVAL; + goto error; + } if (conflict_field_name(tp->args[i].name, tp->args, i)) { - pr_info("Argument%d name '%s' conflicts with " + pr_info("Argument[%d] name '%s' conflicts with " "another field.\n", i, argv[i]); ret = -EINVAL; goto error; @@ -1020,12 +1029,9 @@ static int create_trace_probe(int argc, char **argv) /* Parse fetch argument */ ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); if (ret) { - pr_info("Parse error at argument%d. (%d)\n", i, ret); - kfree(tp->args[i].name); + pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; } - - tp->nr_args++; } ret = register_trace_probe(tp); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4086eae6e81b..7319559ed59f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -31,48 +31,98 @@ static int wakeup_rt; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_lat_flag; +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + #ifdef CONFIG_FUNCTION_TRACER + /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the wakeup function tracers. + * + * Returns 1 if it is OK to continue, and preemption + * is disabled and data->disabled is incremented. + * 0 if the trace is to be ignored, and preemption + * is not disabled and data->disabled is + * kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +static int +func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, + int *pc) { - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; - int pc; if (likely(!wakeup_task)) - return; + return 0; - pc = preempt_count(); + *pc = preempt_count(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); if (cpu != wakeup_current_cpu) goto out_enable; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; - local_irq_save(flags); + return 1; - trace_function(tr, ip, parent_ip, flags, pc); +out: + atomic_dec(&(*data)->disabled); + +out_enable: + preempt_enable_notrace(); + return 0; +} +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); local_irq_restore(flags); - out: atomic_dec(&data->disabled); - out_enable: preempt_enable_notrace(); } @@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ +static int start_func_tracer(int graph) +{ + int ret; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(int graph) +{ + tracer_enabled = 0; + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_func_tracer(!set); + + wakeup_reset(wakeup_trace); + tracing_max_latency = 0; + + return start_func_tracer(set); +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc, ret = 0; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return 0; + + local_save_flags(flags); + ret = __trace_graph_entry(tr, trace, flags, pc); + atomic_dec(&data->disabled); + preempt_enable_notrace(); + + return ret; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_save_flags(flags); + __trace_graph_return(tr, trace, flags, pc); + atomic_dec(&data->disabled); + + preempt_enable_notrace(); + return; +} + +static void wakeup_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); +} + +static void wakeup_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} +#else +#define __trace_function trace_function + +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +static void wakeup_print_header(struct seq_file *s) { } +static void wakeup_trace_open(struct trace_iterator *iter) { } +static void wakeup_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? */ @@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore, /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; - trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); T0 = data->preempt_timestamp; @@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) * is not called by an assembly function (where as schedule is) * it should be safe to use it here. */ - trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: arch_spin_unlock(&wakeup_lock); @@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - register_ftrace_function(&trace_ops); - - if (tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; + if (start_func_tracer(is_graph())) + printk(KERN_ERR "failed to start wakeup tracer\n"); return; fail_deprobe_wake_new: @@ -320,7 +516,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); + stop_func_tracer(is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly = .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; @@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly = .stop = wakeup_tracer_stop, .wait_pipe = poll_wait_pipe, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 155a415b3209..562c56e048fd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) static int trace_wakeup_test_thread(void *data) { /* Make this a RT thread, doesn't need to be too high */ - struct sched_param param = { .sched_priority = 5 }; + static struct sched_param param = { .sched_priority = 5 }; struct completion *x = data; sched_setscheduler(current, SCHED_FIFO, ¶m); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a6b7e0e0f3eb..4c5dead0c239 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = { .open = tracing_open_generic, .read = stack_max_size_read, .write = stack_max_size_write, + .llseek = default_llseek, }; static void * diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index a7cc3793baf6..209b379a4721 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void) { int ret, cpu; + for_each_possible_cpu(cpu) { + spin_lock_init(&workqueue_cpu_stat(cpu)->lock); + INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); + } + ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); if (ret) goto out; @@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void) if (ret) goto no_creation; - for_each_possible_cpu(cpu) { - spin_lock_init(&workqueue_cpu_stat(cpu)->lock); - INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); - } - return 0; no_creation: |