summaryrefslogtreecommitdiffstats
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig18
-rw-r--r--kernel/trace/blktrace.c2
-rw-r--r--kernel/trace/ftrace.c88
-rw-r--r--kernel/trace/ring_buffer.c108
-rw-r--r--kernel/trace/trace.c253
-rw-r--r--kernel/trace/trace.h134
-rw-r--r--kernel/trace/trace_clock.c5
-rw-r--r--kernel/trace/trace_events.c1
-rw-r--r--kernel/trace/trace_functions.c61
-rw-r--r--kernel/trace/trace_functions_graph.c68
-rw-r--r--kernel/trace/trace_probe.h1
-rw-r--r--kernel/trace/trace_sched_wakeup.c2
-rw-r--r--kernel/trace/trace_selftest.c21
-rw-r--r--kernel/trace/trace_syscalls.c18
-rw-r--r--kernel/trace/trace_uprobe.c217
15 files changed, 728 insertions, 269 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ad0a067ad4b3..192473b22799 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE
help
See Documentation/trace/ftrace-design.txt
+config HAVE_DYNAMIC_FTRACE_WITH_REGS
+ bool
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
@@ -235,6 +238,16 @@ config FTRACE_SYSCALLS
help
Basic tracer to catch the syscall entry and exit events.
+config TRACER_SNAPSHOT
+ bool "Create a snapshot trace buffer"
+ select TRACER_MAX_TRACE
+ help
+ Allow tracing users to take snapshot of the current buffer using the
+ ftrace interface, e.g.:
+
+ echo 1 > /sys/kernel/debug/tracing/snapshot
+ cat snapshot
+
config TRACE_BRANCH_PROFILING
bool
select GENERIC_TRACER
@@ -419,6 +432,11 @@ config DYNAMIC_FTRACE
were made. If so, it runs stop_machine (stops all CPUS)
and modifies the code to jump over the call to ftrace.
+config DYNAMIC_FTRACE_WITH_REGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c0bd0308741c..71259e2b6b61 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
return;
local_irq_save(flags);
- buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
+ buf = this_cpu_ptr(bt->msg_data);
va_start(args, fmt);
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 41473b4ad7a4..ce8c3d68292f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
#endif
+/*
+ * Traverse the ftrace_global_list, invoking all entries. The reason that we
+ * can use rcu_dereference_raw() is that elements removed from this list
+ * are simply leaked, so there is no need to interact with a grace-period
+ * mechanism. The rcu_dereference_raw() calls are needed to handle
+ * concurrent insertions into the ftrace_global_list.
+ *
+ * Silly Alpha and silly pointer-speculation compiler optimizations!
+ */
+#define do_for_each_ftrace_op(op, list) \
+ op = rcu_dereference_raw(list); \
+ do
+
+/*
+ * Optimized for just a single item in the list (as that is the normal case).
+ */
+#define while_for_each_ftrace_op(op) \
+ while (likely(op = rcu_dereference_raw((op)->next)) && \
+ unlikely((op) != &ftrace_list_end))
+
/**
* ftrace_nr_registered_ops - return number of ops registered
*
@@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void)
return cnt;
}
-/*
- * Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw() is that elements removed from this list
- * are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw() calls are needed to handle
- * concurrent insertions into the ftrace_global_list.
- *
- * Silly Alpha and silly pointer-speculation compiler optimizations!
- */
static void
ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
- if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
+ int bit;
+
+ bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
+ if (bit < 0)
return;
- trace_recursion_set(TRACE_GLOBAL_BIT);
- op = rcu_dereference_raw(ftrace_global_list); /*see above*/
- while (op != &ftrace_list_end) {
+ do_for_each_ftrace_op(op, ftrace_global_list) {
op->func(ip, parent_ip, op, regs);
- op = rcu_dereference_raw(op->next); /*see above*/
- };
- trace_recursion_clear(TRACE_GLOBAL_BIT);
+ } while_for_each_ftrace_op(op);
+
+ trace_clear_recursion(bit);
}
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
@@ -221,10 +233,24 @@ static void update_global_ops(void)
* registered callers.
*/
if (ftrace_global_list == &ftrace_list_end ||
- ftrace_global_list->next == &ftrace_list_end)
+ ftrace_global_list->next == &ftrace_list_end) {
func = ftrace_global_list->func;
- else
+ /*
+ * As we are calling the function directly.
+ * If it does not have recursion protection,
+ * the function_trace_op needs to be updated
+ * accordingly.
+ */
+ if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)
+ global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
+ else
+ global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
+ } else {
func = ftrace_global_list_func;
+ /* The list has its own recursion protection. */
+ global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
+ }
+
/* If we filter on pids, update to use the pid function */
if (!list_empty(&ftrace_pids)) {
@@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
return -EINVAL;
-#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
/*
* If the ftrace_ops specifies SAVE_REGS, then it only can be used
* if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
@@ -4090,14 +4116,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
*/
preempt_disable_notrace();
trace_recursion_set(TRACE_CONTROL_BIT);
- op = rcu_dereference_raw(ftrace_control_list);
- while (op != &ftrace_list_end) {
+ do_for_each_ftrace_op(op, ftrace_control_list) {
if (!ftrace_function_local_disabled(op) &&
ftrace_ops_test(op, ip))
op->func(ip, parent_ip, op, regs);
-
- op = rcu_dereference_raw(op->next);
- };
+ } while_for_each_ftrace_op(op);
trace_recursion_clear(TRACE_CONTROL_BIT);
preempt_enable_notrace();
}
@@ -4112,27 +4135,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
{
struct ftrace_ops *op;
+ int bit;
if (function_trace_stop)
return;
- if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
+ bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ if (bit < 0)
return;
- trace_recursion_set(TRACE_INTERNAL_BIT);
/*
* Some of the ops may be dynamically allocated,
* they must be freed after a synchronize_sched().
*/
preempt_disable_notrace();
- op = rcu_dereference_raw(ftrace_ops_list);
- while (op != &ftrace_list_end) {
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
if (ftrace_ops_test(op, ip))
op->func(ip, parent_ip, op, regs);
- op = rcu_dereference_raw(op->next);
- };
+ } while_for_each_ftrace_op(op);
preempt_enable_notrace();
- trace_recursion_clear(TRACE_INTERNAL_BIT);
+ trace_clear_recursion(bit);
}
/*
@@ -4143,8 +4165,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
* Archs are to support both the regs and ftrace_ops at the same time.
* If they support ftrace_ops, it is assumed they support regs.
* If call backs want to use regs, they must either check for regs
- * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS.
- * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved.
+ * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
+ * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
* An architecture can pass partial regs with ftrace_ops and still
* set the ARCH_SUPPORT_FTARCE_OPS.
*/
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ce8514feedcd..7244acde77b0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,8 +3,10 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/ftrace_event.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
+#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
@@ -21,7 +23,6 @@
#include <linux/fs.h>
#include <asm/local.h>
-#include "trace.h"
static void update_pages_handler(struct work_struct *work);
@@ -2432,41 +2433,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,
#ifdef CONFIG_TRACING
-#define TRACE_RECURSIVE_DEPTH 16
+/*
+ * The lock and unlock are done within a preempt disable section.
+ * The current_context per_cpu variable can only be modified
+ * by the current task between lock and unlock. But it can
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing,
+ * we use a bitmask trick.
+ *
+ * bit 0 = NMI context
+ * bit 1 = IRQ context
+ * bit 2 = SoftIRQ context
+ * bit 3 = normal context.
+ *
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
+ *
+ * (binary)
+ * 101 - 1 = 100
+ * 101 & 100 = 100 (clearing bit zero)
+ *
+ * 1010 - 1 = 1001
+ * 1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
+ */
+static DEFINE_PER_CPU(unsigned int, current_context);
-/* Keep this code out of the fast path cache */
-static noinline void trace_recursive_fail(void)
+static __always_inline int trace_recursive_lock(void)
{
- /* Disable all tracing before we do anything else */
- tracing_off_permanent();
-
- printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
- "HC[%lu]:SC[%lu]:NMI[%lu]\n",
- trace_recursion_buffer(),
- hardirq_count() >> HARDIRQ_SHIFT,
- softirq_count() >> SOFTIRQ_SHIFT,
- in_nmi());
-
- WARN_ON_ONCE(1);
-}
+ unsigned int val = this_cpu_read(current_context);
+ int bit;
-static inline int trace_recursive_lock(void)
-{
- trace_recursion_inc();
+ if (in_interrupt()) {
+ if (in_nmi())
+ bit = 0;
+ else if (in_irq())
+ bit = 1;
+ else
+ bit = 2;
+ } else
+ bit = 3;
- if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
- return 0;
+ if (unlikely(val & (1 << bit)))
+ return 1;
- trace_recursive_fail();
+ val |= (1 << bit);
+ this_cpu_write(current_context, val);
- return -1;
+ return 0;
}
-static inline void trace_recursive_unlock(void)
+static __always_inline void trace_recursive_unlock(void)
{
- WARN_ON_ONCE(!trace_recursion_buffer());
+ unsigned int val = this_cpu_read(current_context);
- trace_recursion_dec();
+ val--;
+ val &= this_cpu_read(current_context);
+ this_cpu_write(current_context, val);
}
#else
@@ -3067,6 +3103,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
/**
+ * ring_buffer_read_events_cpu - get the number of events successfully read
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of events read
+ */
+unsigned long
+ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->read;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
+
+/**
* ring_buffer_entries - get the number of entries in a buffer
* @buffer: The ring buffer
*
@@ -3425,7 +3479,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
/* check for end of page padding */
if ((iter->head >= rb_page_size(iter->head_page)) &&
(iter->head_page != cpu_buffer->commit_page))
- rb_advance_iter(iter);
+ rb_inc_iter(iter);
}
static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3c13e46d7d24..c2e2c2310374 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
+#include <linux/sched/rt.h>
#include "trace.h"
#include "trace_output.h"
@@ -249,7 +250,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
static struct tracer *trace_types __read_mostly;
/* current_trace points to the tracer that is currently active */
-static struct tracer *current_trace __read_mostly;
+static struct tracer *current_trace __read_mostly = &nop_trace;
/*
* trace_types_lock is used to protect the trace_types list.
@@ -709,10 +710,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
return;
WARN_ON_ONCE(!irqs_disabled());
- if (!current_trace->use_max_tr) {
- WARN_ON_ONCE(1);
+
+ if (!current_trace->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(current_trace != &nop_trace);
return;
}
+
arch_spin_lock(&ftrace_max_lock);
tr->buffer = max_tr.buffer;
@@ -739,10 +743,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
return;
WARN_ON_ONCE(!irqs_disabled());
- if (!current_trace->use_max_tr) {
- WARN_ON_ONCE(1);
+ if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
return;
- }
arch_spin_lock(&ftrace_max_lock);
@@ -862,10 +864,13 @@ int register_tracer(struct tracer *type)
current_trace = type;
- /* If we expanded the buffers, make sure the max is expanded too */
- if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, trace_buf_size,
- RING_BUFFER_ALL_CPUS);
+ if (type->use_max_tr) {
+ /* If we expanded the buffers, make sure the max is expanded too */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(max_tr.buffer, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
+ type->allocated_snapshot = true;
+ }
/* the test is responsible for initializing and enabling */
pr_info("Testing tracer %s: ", type->name);
@@ -881,10 +886,14 @@ int register_tracer(struct tracer *type)
/* Only reset on passing, to avoid touching corrupted buffers */
tracing_reset_online_cpus(tr);
- /* Shrink the max buffer again */
- if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, 1,
- RING_BUFFER_ALL_CPUS);
+ if (type->use_max_tr) {
+ type->allocated_snapshot = false;
+
+ /* Shrink the max buffer again */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(max_tr.buffer, 1,
+ RING_BUFFER_ALL_CPUS);
+ }
printk(KERN_CONT "PASSED\n");
}
@@ -922,6 +931,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
{
struct ring_buffer *buffer = tr->buffer;
+ if (!buffer)
+ return;
+
ring_buffer_record_disable(buffer);
/* Make sure all commits have finished */
@@ -936,6 +948,9 @@ void tracing_reset_online_cpus(struct trace_array *tr)
struct ring_buffer *buffer = tr->buffer;
int cpu;
+ if (!buffer)
+ return;
+
ring_buffer_record_disable(buffer);
/* Make sure all commits have finished */
@@ -1167,7 +1182,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
- entry->padding = 0;
entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1335,7 +1349,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
preempt_disable_notrace();
- use_stack = ++__get_cpu_var(ftrace_stack_reserve);
+ use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
/*
* We don't need any atomic variables, just a barrier.
* If an interrupt comes in, we don't care, because it would
@@ -1389,7 +1403,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
out:
/* Again, don't let gcc optimize things here */
barrier();
- __get_cpu_var(ftrace_stack_reserve)--;
+ __this_cpu_dec(ftrace_stack_reserve);
preempt_enable_notrace();
}
@@ -1517,7 +1531,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer;
static char *get_trace_buf(void)
{
struct trace_buffer_struct *percpu_buffer;
- struct trace_buffer_struct *buffer;
/*
* If we have allocated per cpu buffers, then we do not
@@ -1535,9 +1548,7 @@ static char *get_trace_buf(void)
if (!percpu_buffer)
return NULL;
- buffer = per_cpu_ptr(percpu_buffer, smp_processor_id());
-
- return buffer->buffer;
+ return this_cpu_ptr(&percpu_buffer->buffer[0]);
}
static int alloc_percpu_trace_buffer(void)
@@ -1942,21 +1953,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
static void *s_start(struct seq_file *m, loff_t *pos)
{
struct trace_iterator *iter = m->private;
- static struct tracer *old_tracer;
int cpu_file = iter->cpu_file;
void *p = NULL;
loff_t l = 0;
int cpu;
- /* copy the tracer to avoid using a global lock all around */
+ /*
+ * copy the tracer to avoid using a global lock all around.
+ * iter->trace is a copy of current_trace, the pointer to the
+ * name may be used instead of a strcmp(), as iter->trace->name
+ * will point to the same string as current_trace->name.
+ */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
+ if (unlikely(current_trace && iter->trace->name != current_trace->name))
*iter->trace = *current_trace;
- }
mutex_unlock(&trace_types_lock);
- atomic_inc(&trace_record_cmdline_disabled);
+ if (iter->snapshot && iter->trace->use_max_tr)
+ return ERR_PTR(-EBUSY);
+
+ if (!iter->snapshot)
+ atomic_inc(&trace_record_cmdline_disabled);
if (*pos != iter->pos) {
iter->ent = NULL;
@@ -1995,7 +2012,11 @@ static void s_stop(struct seq_file *m, void *p)
{
struct trace_iterator *iter = m->private;
- atomic_dec(&trace_record_cmdline_disabled);
+ if (iter->snapshot && iter->trace->use_max_tr)
+ return;
+
+ if (!iter->snapshot)
+ atomic_dec(&trace_record_cmdline_disabled);
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
}
@@ -2080,8 +2101,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
unsigned long total;
const char *name = "preemption";
- if (type)
- name = type->name;
+ name = type->name;
get_total_entries(tr, &total, &entries);
@@ -2430,7 +2450,7 @@ static const struct seq_operations tracer_seq_ops = {
};
static struct trace_iterator *
-__tracing_open(struct inode *inode, struct file *file)
+__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
long cpu_file = (long) inode->i_private;
struct trace_iterator *iter;
@@ -2457,16 +2477,16 @@ __tracing_open(struct inode *inode, struct file *file)
if (!iter->trace)
goto fail;
- if (current_trace)
- *iter->trace = *current_trace;
+ *iter->trace = *current_trace;
if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
goto fail;
- if (current_trace && current_trace->print_max)
+ if (current_trace->print_max || snapshot)
iter->tr = &max_tr;
else
iter->tr = &global_trace;
+ iter->snapshot = snapshot;
iter->pos = -1;
mutex_init(&iter->mutex);
iter->cpu_file = cpu_file;
@@ -2483,8 +2503,9 @@ __tracing_open(struct inode *inode, struct file *file)
if (trace_clocks[trace_clock_id].in_ns)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
- /* stop the trace while dumping */
- tracing_stop();
+ /* stop the trace while dumping if we are not opening "snapshot" */
+ if (!iter->snapshot)
+ tracing_stop();
if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
for_each_tracing_cpu(cpu) {
@@ -2547,8 +2568,9 @@ static int tracing_release(struct inode *inode, struct file *file)
if (iter->trace && iter->trace->close)
iter->trace->close(iter);
- /* reenable tracing if it was previously enabled */
- tracing_start();
+ if (!iter->snapshot)
+ /* reenable tracing if it was previously enabled */
+ tracing_start();
mutex_unlock(&trace_types_lock);
mutex_destroy(&iter->mutex);
@@ -2576,7 +2598,7 @@ static int tracing_open(struct inode *inode, struct file *file)
}
if (file->f_mode & FMODE_READ) {
- iter = __tracing_open(inode, file);
+ iter = __tracing_open(inode, file, false);
if (IS_ERR(iter))
ret = PTR_ERR(iter);
else if (trace_flags & TRACE_ITER_LATENCY_FMT)
@@ -3014,10 +3036,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
int r;
mutex_lock(&trace_types_lock);
- if (current_trace)
- r = sprintf(buf, "%s\n", current_trace->name);
- else
- r = sprintf(buf, "\n");
+ r = sprintf(buf, "%s\n", current_trace->name);
mutex_unlock(&trace_types_lock);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3183,6 +3202,7 @@ static int tracing_set_tracer(const char *buf)
static struct trace_option_dentry *topts;
struct trace_array *tr = &global_trace;
struct tracer *t;
+ bool had_max_tr;
int ret = 0;
mutex_lock(&trace_types_lock);
@@ -3207,9 +3227,21 @@ static int tracing_set_tracer(const char *buf)
goto out;
trace_branch_disable();
- if (current_trace && current_trace->reset)
+ if (current_trace->reset)
current_trace->reset(tr);
- if (current_trace && current_trace->use_max_tr) {
+
+ had_max_tr = current_trace->allocated_snapshot;
+ current_trace = &nop_trace;
+
+ if (had_max_tr && !t->use_max_tr) {
+ /*
+ * We need to make sure that the update_max_tr sees that
+ * current_trace changed to nop_trace to keep it from
+ * swapping the buffers after we resize it.
+ * The update_max_tr is called from interrupts disabled
+ * so a synchronized_sched() is sufficient.
+ */
+ synchronize_sched();
/*
* We don't free the ring buffer. instead, resize it because
* The max_tr ring buffer has some state (e.g. ring->clock) and
@@ -3217,18 +3249,19 @@ static int tracing_set_tracer(const char *buf)
*/
ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
set_buffer_entries(&max_tr, 1);
+ tracing_reset_online_cpus(&max_tr);
+ current_trace->allocated_snapshot = false;
}
destroy_trace_option_files(topts);
- current_trace = &nop_trace;
-
topts = create_trace_option_files(t);
- if (t->use_max_tr) {
+ if (t->use_max_tr && !had_max_tr) {
/* we need to make per cpu buffer sizes equivalent */
ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
RING_BUFFER_ALL_CPUS);
if (ret < 0)
goto out;
+ t->allocated_snapshot = true;
}
if (t->init) {
@@ -3336,8 +3369,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
ret = -ENOMEM;
goto fail;
}
- if (current_trace)
- *iter->trace = *current_trace;
+ *iter->trace = *current_trace;
if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
ret = -ENOMEM;
@@ -3477,7 +3509,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_iterator *iter = filp->private_data;
- static struct tracer *old_tracer;
ssize_t sret;
/* return any leftover data */
@@ -3489,10 +3520,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
+ if (unlikely(iter->trace->name != current_trace->name))
*iter->trace = *current_trace;
- }
mutex_unlock(&trace_types_lock);
/*
@@ -3648,7 +3677,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.ops = &tracing_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
- static struct tracer *old_tracer;
ssize_t ret;
size_t rem;
unsigned int i;
@@ -3658,10 +3686,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
+ if (unlikely(iter->trace->name != current_trace->name))
*iter->trace = *current_trace;
- }
mutex_unlock(&trace_types_lock);
mutex_lock(&iter->mutex);
@@ -4037,8 +4063,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
* Reset the buffer so that it doesn't have incomparable timestamps.
*/
tracing_reset_online_cpus(&global_trace);
- if (max_tr.buffer)
- tracing_reset_online_cpus(&max_tr);
+ tracing_reset_online_cpus(&max_tr);
mutex_unlock(&trace_types_lock);
@@ -4054,6 +4079,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
return single_open(file, tracing_clock_show, NULL);
}
+#ifdef CONFIG_TRACER_SNAPSHOT
+static int tracing_snapshot_open(struct inode *inode, struct file *file)
+{
+ struct trace_iterator *iter;
+ int ret = 0;
+
+ if (file->f_mode & FMODE_READ) {
+ iter = __tracing_open(inode, file, true);
+ if (IS_ERR(iter))
+ ret = PTR_ERR(iter);
+ }
+ return ret;
+}
+
+static ssize_t
+tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ mutex_lock(&trace_types_lock);
+
+ if (current_trace->use_max_tr) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ switch (val) {
+ case 0:
+ if (current_trace->allocated_snapshot) {
+ /* free spare buffer */
+ ring_buffer_resize(max_tr.buffer, 1,
+ RING_BUFFER_ALL_CPUS);
+ set_buffer_entries(&max_tr, 1);
+ tracing_reset_online_cpus(&max_tr);
+ current_trace->allocated_snapshot = false;
+ }
+ break;
+ case 1:
+ if (!current_trace->allocated_snapshot) {
+ /* allocate spare buffer */
+ ret = resize_buffer_duplicate_size(&max_tr,
+ &global_trace, RING_BUFFER_ALL_CPUS);
+ if (ret < 0)
+ break;
+ current_trace->allocated_snapshot = true;
+ }
+
+ local_irq_disable();
+ /* Now, we're going to swap */
+ update_max_tr(&global_trace, current, smp_processor_id());
+ local_irq_enable();
+ break;
+ default:
+ if (current_trace->allocated_snapshot)
+ tracing_reset_online_cpus(&max_tr);
+ else
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret >= 0) {
+ *ppos += cnt;
+ ret = cnt;
+ }
+out:
+ mutex_unlock(&trace_types_lock);
+ return ret;
+}
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -4110,6 +4216,16 @@ static const struct file_operations trace_clock_fops = {
.write = tracing_clock_write,
};
+#ifdef CONFIG_TRACER_SNAPSHOT
+static const struct file_operations snapshot_fops = {
+ .open = tracing_snapshot_open,
+ .read = seq_read,
+ .write = tracing_snapshot_write,
+ .llseek = tracing_seek,
+ .release = tracing_release,
+};
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
struct ftrace_buffer_info {
struct trace_array *tr;
void *spare;
@@ -4414,6 +4530,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
trace_seq_printf(s, "dropped events: %ld\n", cnt);
+ cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
+ trace_seq_printf(s, "read events: %ld\n", cnt);
+
count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
kfree(s);
@@ -4490,7 +4609,7 @@ struct dentry *tracing_init_dentry(void)
static struct dentry *d_percpu;
-struct dentry *tracing_dentry_percpu(void)
+static struct dentry *tracing_dentry_percpu(void)
{
static int once;
struct dentry *d_tracer;
@@ -4906,6 +5025,11 @@ static __init int tracer_init_debugfs(void)
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+ trace_create_file("snapshot", 0644, d_tracer,
+ (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
+#endif
+
create_trace_options_dir();
for_each_tracing_cpu(cpu)
@@ -5014,6 +5138,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
if (disable_tracing)
ftrace_kill();
+ /* Simulate the iterator */
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
@@ -5025,10 +5150,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
/* don't look at user memory in panic mode */
trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
- /* Simulate the iterator */
- iter.tr = &global_trace;
- iter.trace = current_trace;
-
switch (oops_dump_mode) {
case DUMP_ALL:
iter.cpu_file = TRACE_PIPE_ALL_CPU;
@@ -5173,7 +5294,7 @@ __init static int tracer_alloc_buffers(void)
init_irq_work(&trace_work_wakeup, trace_wake_up);
register_tracer(&nop_trace);
- current_trace = &nop_trace;
+
/* All seems OK, enable tracing */
tracing_disabled = 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c75d7988902c..57d7e5397d56 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -287,20 +287,62 @@ struct tracer {
struct tracer_flags *flags;
bool print_max;
bool use_max_tr;
+ bool allocated_snapshot;
};
/* Only current can touch trace_recursion */
-#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
-#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
-/* Ring buffer has the 10 LSB bits to count */
-#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
-
-/* for function tracing recursion */
-#define TRACE_INTERNAL_BIT (1<<11)
-#define TRACE_GLOBAL_BIT (1<<12)
-#define TRACE_CONTROL_BIT (1<<13)
+/*
+ * For function tracing recursion:
+ * The order of these bits are important.
+ *
+ * When function tracing occurs, the following steps are made:
+ * If arch does not support a ftrace feature:
+ * call internal function (uses INTERNAL bits) which calls...
+ * If callback is registered to the "global" list, the list
+ * function is called and recursion checks the GLOBAL bits.
+ * then this function calls...
+ * The function callback, which can use the FTRACE bits to
+ * check for recursion.
+ *
+ * Now if the arch does not suppport a feature, and it calls
+ * the global list function which calls the ftrace callback
+ * all three of these steps will do a recursion protection.
+ * There's no reason to do one if the previous caller already
+ * did. The recursion that we are protecting against will
+ * go through the same steps again.
+ *
+ * To prevent the multiple recursion checks, if a recursion
+ * bit is set that is higher than the MAX bit of the current
+ * check, then we know that the check was made by the previous
+ * caller, and we can skip the current check.
+ */
+enum {
+ TRACE_BUFFER_BIT,
+ TRACE_BUFFER_NMI_BIT,
+ TRACE_BUFFER_IRQ_BIT,
+ TRACE_BUFFER_SIRQ_BIT,
+
+ /* Start of function recursion bits */
+ TRACE_FTRACE_BIT,
+ TRACE_FTRACE_NMI_BIT,
+ TRACE_FTRACE_IRQ_BIT,
+ TRACE_FTRACE_SIRQ_BIT,
+
+ /* GLOBAL_BITs must be greater than FTRACE_BITs */
+ TRACE_GLOBAL_BIT,
+ TRACE_GLOBAL_NMI_BIT,
+ TRACE_GLOBAL_IRQ_BIT,
+ TRACE_GLOBAL_SIRQ_BIT,
+
+ /* INTERNAL_BITs must be greater than GLOBAL_BITs */
+ TRACE_INTERNAL_BIT,
+ TRACE_INTERNAL_NMI_BIT,
+ TRACE_INTERNAL_IRQ_BIT,
+ TRACE_INTERNAL_SIRQ_BIT,
+
+ TRACE_CONTROL_BIT,
/*
* Abuse of the trace_recursion.
@@ -309,11 +351,77 @@ struct tracer {
* was called in irq context but we have irq tracing off. Since this
* can only be modified by current, we can reuse trace_recursion.
*/
-#define TRACE_IRQ_BIT (1<<13)
+ TRACE_IRQ_BIT,
+};
+
+#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
+#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
+#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
+
+#define TRACE_CONTEXT_BITS 4
+
+#define TRACE_FTRACE_START TRACE_FTRACE_BIT
+#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
+#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_LIST_START TRACE_INTERNAL_BIT
+#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
+
+static __always_inline int trace_get_context_bit(void)
+{
+ int bit;
-#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
-#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
-#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
+ if (in_interrupt()) {
+ if (in_nmi())
+ bit = 0;
+
+ else if (in_irq())
+ bit = 1;
+ else
+ bit = 2;
+ } else
+ bit = 3;
+
+ return bit;
+}
+
+static __always_inline int trace_test_and_set_recursion(int start, int max)
+{
+ unsigned int val = current->trace_recursion;
+ int bit;
+
+ /* A previous recursion check was made */
+ if ((val & TRACE_CONTEXT_MASK) > max)
+ return 0;
+
+ bit = trace_get_context_bit() + start;
+ if (unlikely(val & (1 << bit)))
+ return -1;
+
+ val |= 1 << bit;
+ current->trace_recursion = val;
+ barrier();
+
+ return bit;
+}
+
+static __always_inline void trace_clear_recursion(int bit)
+{
+ unsigned int val = current->trace_recursion;
+
+ if (!bit)
+ return;
+
+ bit = 1 << bit;
+ val &= ~bit;
+
+ barrier();
+ current->trace_recursion = val;
+}
#define TRACE_PIPE_ALL_CPU -1
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cbb..aa8f5f48dae6 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -21,8 +21,6 @@
#include <linux/ktime.h>
#include <linux/trace_clock.h>
-#include "trace.h"
-
/*
* trace_clock_local(): the simplest and least coherent tracing clock.
*
@@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void)
return clock;
}
+EXPORT_SYMBOL_GPL(trace_clock_local);
/*
* trace_clock(): 'between' trace clock. Not completely serialized,
@@ -86,7 +85,7 @@ u64 notrace trace_clock_global(void)
local_irq_save(flags);
this_cpu = raw_smp_processor_id();
- now = cpu_clock(this_cpu);
+ now = sched_clock_cpu(this_cpu);
/*
* If in an NMI context then dont risk lockups and return the
* cpu_clock() time:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 880073d0b946..57e9b284250c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
__common_field(unsigned char, flags);
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
- __common_field(int, padding);
return ret;
}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8e3ad8082ab7..601152523326 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr)
tracing_reset_online_cpus(tr);
}
-static void
-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
-{
- struct trace_array *tr = func_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu;
- int pc;
-
- if (unlikely(!ftrace_function_enabled))
- return;
-
- pc = preempt_count();
- preempt_disable_notrace();
- local_save_flags(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
-
- if (likely(disabled == 1))
- trace_function(tr, ip, parent_ip, flags, pc);
-
- atomic_dec(&data->disabled);
- preempt_enable_notrace();
-}
-
/* Our option */
enum {
TRACE_FUNC_OPT_STACK = 0x1,
@@ -85,34 +57,34 @@ static struct tracer_flags func_flags;
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
-
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
unsigned long flags;
- long disabled;
+ int bit;
int cpu;
int pc;
if (unlikely(!ftrace_function_enabled))
return;
- /*
- * Need to use raw, since this must be called before the
- * recursive protection is performed.
- */
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
+ pc = preempt_count();
+ preempt_disable_notrace();
- if (likely(disabled == 1)) {
- pc = preempt_count();
+ bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
+ if (bit < 0)
+ goto out;
+
+ cpu = smp_processor_id();
+ data = tr->data[cpu];
+ if (!atomic_read(&data->disabled)) {
+ local_save_flags(flags);
trace_function(tr, ip, parent_ip, flags, pc);
}
+ trace_clear_recursion(bit);
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ out:
+ preempt_enable_notrace();
}
static void
@@ -185,11 +157,6 @@ static void tracing_start_function_trace(void)
{
ftrace_function_enabled = 0;
- if (trace_flags & TRACE_ITER_PREEMPTONLY)
- trace_ops.func = function_trace_call_preempt_only;
- else
- trace_ops.func = function_trace_call;
-
if (func_flags.val & TRACE_FUNC_OPT_STACK)
register_ftrace_function(&trace_stack_ops);
else
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4edb4b74eb7e..39ada66389cc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -47,6 +47,8 @@ struct fgraph_data {
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
#define TRACE_GRAPH_PRINT_IRQS 0x40
+static unsigned int max_depth;
+
static struct tracer_opt trace_opts[] = {
/* Display overruns? (for self-debug purpose) */
{ TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
@@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
ftrace_pop_return_trace(&trace, &ret, frame_pointer);
trace.rettime = trace_clock_local();
- ftrace_graph_return(&trace);
barrier();
current->curr_ret_stack--;
+ /*
+ * The trace should run after decrementing the ret counter
+ * in case an interrupt were to come in. We don't want to
+ * lose the interrupt if max_depth is set.
+ */
+ ftrace_graph_return(&trace);
+
if (unlikely(!ret)) {
ftrace_graph_stop();
WARN_ON(1);
@@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return 0;
/* trace it when it is-nested-in or is a function enabled. */
- if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
- ftrace_graph_ignore_irqs())
+ if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
+ ftrace_graph_ignore_irqs()) ||
+ (max_depth && trace->depth >= max_depth))
return 0;
local_irq_save(flags);
@@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = {
#endif
};
+
+static ssize_t
+graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ max_depth = val;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
+ int n;
+
+ n = sprintf(buf, "%d\n", max_depth);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static const struct file_operations graph_depth_fops = {
+ .open = tracing_open_generic,
+ .write = graph_depth_write,
+ .read = graph_depth_read,
+ .llseek = generic_file_llseek,
+};
+
+static __init int init_graph_debugfs(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ trace_create_file("max_graph_depth", 0644, d_tracer,
+ NULL, &graph_depth_fops);
+
+ return 0;
+}
+fs_initcall(init_graph_debugfs);
+
static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 933708677814..5c7e09d10d74 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -66,7 +66,6 @@
#define TP_FLAG_TRACE 1
#define TP_FLAG_PROFILE 2
#define TP_FLAG_REGISTERED 4
-#define TP_FLAG_UPROBE 8
/* data_rloc: data relative location, compatible with u32 */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9fe45fcefca0..75aa97fbe1a1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,8 +15,8 @@
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/sched/rt.h>
#include <trace/events/sched.h>
-
#include "trace.h"
static struct trace_array *wakeup_trace;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 47623169a815..51c819c12c29 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip,
* The ftrace infrastructure should provide the recursion
* protection. If not, this will crash the kernel!
*/
- trace_selftest_recursion_cnt++;
+ if (trace_selftest_recursion_cnt++ > 10)
+ return;
DYN_FTRACE_TEST_NAME();
}
@@ -452,7 +453,6 @@ trace_selftest_function_recursion(void)
char *func_name;
int len;
int ret;
- int cnt;
/* The previous test PASSED */
pr_cont("PASSED\n");
@@ -510,19 +510,10 @@ trace_selftest_function_recursion(void)
unregister_ftrace_function(&test_recsafe_probe);
- /*
- * If arch supports all ftrace features, and no other task
- * was on the list, we should be fine.
- */
- if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
- cnt = 2; /* Should have recursed */
- else
- cnt = 1;
-
ret = -1;
- if (trace_selftest_recursion_cnt != cnt) {
- pr_cont("*callback not called expected %d times (%d)* ",
- cnt, trace_selftest_recursion_cnt);
+ if (trace_selftest_recursion_cnt != 2) {
+ pr_cont("*callback not called expected 2 times (%d)* ",
+ trace_selftest_recursion_cnt);
goto out;
}
@@ -568,7 +559,7 @@ trace_selftest_function_regs(void)
int ret;
int supported = 0;
-#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
supported = 1;
#endif
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7609dd6714c2..5329e13e74a1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -77,7 +77,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
return syscalls_metadata[nr];
}
-enum print_line_t
+static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -130,7 +130,7 @@ end:
return TRACE_TYPE_HANDLED;
}
-enum print_line_t
+static enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -270,7 +270,7 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
return ret;
}
-void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
@@ -305,7 +305,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
@@ -337,7 +337,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-int reg_event_syscall_enter(struct ftrace_event_call *call)
+static int reg_event_syscall_enter(struct ftrace_event_call *call)
{
int ret = 0;
int num;
@@ -356,7 +356,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
return ret;
}
-void unreg_event_syscall_enter(struct ftrace_event_call *call)
+static void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
int num;
@@ -371,7 +371,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
mutex_unlock(&syscall_trace_lock);
}
-int reg_event_syscall_exit(struct ftrace_event_call *call)
+static int reg_event_syscall_exit(struct ftrace_event_call *call)
{
int ret = 0;
int num;
@@ -390,7 +390,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
return ret;
}
-void unreg_event_syscall_exit(struct ftrace_event_call *call)
+static void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
int num;
@@ -459,7 +459,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
return (unsigned long)sys_call_table[nr];
}
-int __init init_ftrace_syscalls(void)
+static int __init init_ftrace_syscalls(void)
{
struct syscall_metadata *meta;
unsigned long addr;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c86e6d4f67fb..8dad2a92dee9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,20 +28,21 @@
#define UPROBE_EVENT_SYSTEM "uprobes"
+struct trace_uprobe_filter {
+ rwlock_t rwlock;
+ int nr_systemwide;
+ struct list_head perf_events;
+};
+
/*
* uprobe event core functions
*/
-struct trace_uprobe;
-struct uprobe_trace_consumer {
- struct uprobe_consumer cons;
- struct trace_uprobe *tu;
-};
-
struct trace_uprobe {
struct list_head list;
struct ftrace_event_class class;
struct ftrace_event_call call;
- struct uprobe_trace_consumer *consumer;
+ struct trace_uprobe_filter filter;
+ struct uprobe_consumer consumer;
struct inode *inode;
char *filename;
unsigned long offset;
@@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list);
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
+{
+ rwlock_init(&filter->rwlock);
+ filter->nr_systemwide = 0;
+ INIT_LIST_HEAD(&filter->perf_events);
+}
+
+static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
+{
+ return !filter->nr_systemwide && list_empty(&filter->perf_events);
+}
+
/*
* Allocate new trace_uprobe and initialize it (including uprobes).
*/
@@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
goto error;
INIT_LIST_HEAD(&tu->list);
+ tu->consumer.handler = uprobe_dispatcher;
+ init_trace_uprobe_filter(&tu->filter);
return tu;
error:
@@ -253,12 +268,18 @@ static int create_trace_uprobe(int argc, char **argv)
if (ret)
goto fail_address_parse;
+ inode = igrab(path.dentry->d_inode);
+ path_put(&path);
+
+ if (!inode || !S_ISREG(inode->i_mode)) {
+ ret = -EINVAL;
+ goto fail_address_parse;
+ }
+
ret = kstrtoul(arg, 0, &offset);
if (ret)
goto fail_address_parse;
- inode = igrab(path.dentry->d_inode);
-
argc -= 2;
argv += 2;
@@ -356,7 +377,7 @@ fail_address_parse:
if (inode)
iput(inode);
- pr_info("Failed to parse address.\n");
+ pr_info("Failed to parse address or file.\n");
return ret;
}
@@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
};
/* uprobe handler */
-static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
{
struct uprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
@@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
unsigned long irq_flags;
struct ftrace_event_call *call = &tu->call;
- tu->nhit++;
-
local_save_flags(irq_flags);
pc = preempt_count();
@@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
size, irq_flags, pc);
if (!event)
- return;
+ return 0;
entry = ring_buffer_event_data(event);
- entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+ entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1];
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
if (!filter_current_check_discard(buffer, call, entry, event))
trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+
+ return 0;
}
/* Event entry printers */
@@ -533,42 +554,43 @@ partial:
return TRACE_TYPE_PARTIAL_LINE;
}
-static int probe_event_enable(struct trace_uprobe *tu, int flag)
+static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
{
- struct uprobe_trace_consumer *utc;
- int ret = 0;
+ return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
+}
- if (!tu->inode || tu->consumer)
- return -EINTR;
+typedef bool (*filter_func_t)(struct uprobe_consumer *self,
+ enum uprobe_filter_ctx ctx,
+ struct mm_struct *mm);
- utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
- if (!utc)
+static int
+probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
+{
+ int ret = 0;
+
+ if (is_trace_uprobe_enabled(tu))
return -EINTR;
- utc->cons.handler = uprobe_dispatcher;
- utc->cons.filter = NULL;
- ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
- if (ret) {
- kfree(utc);
- return ret;
- }
+ WARN_ON(!uprobe_filter_is_empty(&tu->filter));
tu->flags |= flag;
- utc->tu = tu;
- tu->consumer = utc;
+ tu->consumer.filter = filter;
+ ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ if (ret)
+ tu->flags &= ~flag;
- return 0;
+ return ret;
}
static void probe_event_disable(struct trace_uprobe *tu, int flag)
{
- if (!tu->inode || !tu->consumer)
+ if (!is_trace_uprobe_enabled(tu))
return;
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
+ WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+
+ uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
tu->flags &= ~flag;
- kfree(tu->consumer);
- tu->consumer = NULL;
}
static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
}
#ifdef CONFIG_PERF_EVENTS
+static bool
+__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
+{
+ struct perf_event *event;
+
+ if (filter->nr_systemwide)
+ return true;
+
+ list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
+ if (event->hw.tp_target->mm == mm)
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool
+uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
+{
+ return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+}
+
+static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+{
+ bool done;
+
+ write_lock(&tu->filter.rwlock);
+ if (event->hw.tp_target) {
+ /*
+ * event->parent != NULL means copy_process(), we can avoid
+ * uprobe_apply(). current->mm must be probed and we can rely
+ * on dup_mmap() which preserves the already installed bp's.
+ *
+ * attr.enable_on_exec means that exec/mmap will install the
+ * breakpoints we need.
+ */
+ done = tu->filter.nr_systemwide ||
+ event->parent || event->attr.enable_on_exec ||
+ uprobe_filter_event(tu, event);
+ list_add(&event->hw.tp_list, &tu->filter.perf_events);
+ } else {
+ done = tu->filter.nr_systemwide;
+ tu->filter.nr_systemwide++;
+ }
+ write_unlock(&tu->filter.rwlock);
+
+ if (!done)
+ uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+
+ return 0;
+}
+
+static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+{
+ bool done;
+
+ write_lock(&tu->filter.rwlock);
+ if (event->hw.tp_target) {
+ list_del(&event->hw.tp_list);
+ done = tu->filter.nr_systemwide ||
+ (event->hw.tp_target->flags & PF_EXITING) ||
+ uprobe_filter_event(tu, event);
+ } else {
+ tu->filter.nr_systemwide--;
+ done = tu->filter.nr_systemwide;
+ }
+ write_unlock(&tu->filter.rwlock);
+
+ if (!done)
+ uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+
+ return 0;
+}
+
+static bool uprobe_perf_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct trace_uprobe *tu;
+ int ret;
+
+ tu = container_of(uc, struct trace_uprobe, consumer);
+ read_lock(&tu->filter.rwlock);
+ ret = __uprobe_perf_filter(&tu->filter, mm);
+ read_unlock(&tu->filter.rwlock);
+
+ return ret;
+}
+
/* uprobe profile handler */
-static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tu->call;
struct uprobe_trace_entry_head *entry;
@@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
int size, __size, i;
int rctx;
+ if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+ return UPROBE_HANDLER_REMOVE;
+
__size = sizeof(*entry) + tu->size;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
- return;
+ return 0;
preempt_disable();
@@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
if (!entry)
goto out;
- entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+ entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1];
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
out:
preempt_enable();
+ return 0;
}
#endif /* CONFIG_PERF_EVENTS */
@@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
switch (type) {
case TRACE_REG_REGISTER:
- return probe_event_enable(tu, TP_FLAG_TRACE);
+ return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
case TRACE_REG_UNREGISTER:
probe_event_disable(tu, TP_FLAG_TRACE);
@@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return probe_event_enable(tu, TP_FLAG_PROFILE);
+ return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
case TRACE_REG_PERF_UNREGISTER:
probe_event_disable(tu, TP_FLAG_PROFILE);
return 0;
+
+ case TRACE_REG_PERF_OPEN:
+ return uprobe_perf_open(tu, data);
+
+ case TRACE_REG_PERF_CLOSE:
+ return uprobe_perf_close(tu, data);
+
#endif
default:
return 0;
@@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
{
- struct uprobe_trace_consumer *utc;
struct trace_uprobe *tu;
+ int ret = 0;
- utc = container_of(con, struct uprobe_trace_consumer, cons);
- tu = utc->tu;
- if (!tu || tu->consumer != utc)
- return 0;
+ tu = container_of(con, struct trace_uprobe, consumer);
+ tu->nhit++;
if (tu->flags & TP_FLAG_TRACE)
- uprobe_trace_func(tu, regs);
+ ret |= uprobe_trace_func(tu, regs);
#ifdef CONFIG_PERF_EVENTS
if (tu->flags & TP_FLAG_PROFILE)
- uprobe_perf_func(tu, regs);
+ ret |= uprobe_perf_func(tu, regs);
#endif
- return 0;
+ return ret;
}
static struct trace_event_functions uprobe_funcs = {
OpenPOWER on IntegriCloud