Diffstat (limited to 'tools/perf/builtin-trace.c')
-rw-r--r--  tools/perf/builtin-trace.c  490
1 file changed, 415 insertions, 75 deletions
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 3ad17ee89403..835619476370 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -77,7 +77,8 @@ struct trace {
struct syscall *table;
struct {
struct perf_evsel *sys_enter,
- *sys_exit;
+ *sys_exit,
+ *augmented;
} events;
} syscalls;
struct record_opts opts;
@@ -88,6 +89,8 @@ struct trace {
u64 base_time;
FILE *output;
unsigned long nr_events;
+ unsigned long nr_events_printed;
+ unsigned long max_events;
struct strlist *ev_qualifier;
struct {
size_t nr;
@@ -105,6 +108,7 @@ struct trace {
} stats;
unsigned int max_stack;
unsigned int min_stack;
+ bool raw_augmented_syscalls;
bool not_ev_qualifier;
bool live;
bool full_time;
@@ -121,7 +125,6 @@ struct trace {
bool force;
bool vfs_getname;
int trace_pgfaults;
- int open_id;
};
struct tp_field {
@@ -157,13 +160,11 @@ TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
-static int tp_field__init_uint(struct tp_field *field,
- struct format_field *format_field,
- bool needs_swap)
+static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
{
- field->offset = format_field->offset;
+ field->offset = offset;
- switch (format_field->size) {
+ switch (size) {
case 1:
field->integer = tp_field__u8;
break;
@@ -183,18 +184,28 @@ static int tp_field__init_uint(struct tp_field *field,
return 0;
}
+static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
+{
+ return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
+}
+
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
return sample->raw_data + field->offset;
}
-static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
+static int __tp_field__init_ptr(struct tp_field *field, int offset)
{
- field->offset = format_field->offset;
+ field->offset = offset;
field->pointer = tp_field__ptr;
return 0;
}
+static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
+{
+ return __tp_field__init_ptr(field, format_field->offset);
+}
+
struct syscall_tp {
struct tp_field id;
union {
@@ -206,7 +217,7 @@ static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
struct tp_field *field,
const char *name)
{
- struct format_field *format_field = perf_evsel__field(evsel, name);
+ struct tep_format_field *format_field = perf_evsel__field(evsel, name);
if (format_field == NULL)
return -1;
@@ -222,7 +233,7 @@ static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
struct tp_field *field,
const char *name)
{
- struct format_field *format_field = perf_evsel__field(evsel, name);
+ struct tep_format_field *format_field = perf_evsel__field(evsel, name);
if (format_field == NULL)
return -1;
@@ -240,7 +251,54 @@ static void perf_evsel__delete_priv(struct perf_evsel *evsel)
perf_evsel__delete(evsel);
}
-static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
+static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
+{
+ struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
+
+ if (evsel->priv != NULL) {
+ if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
+ goto out_delete;
+ return 0;
+ }
+
+ return -ENOMEM;
+out_delete:
+ zfree(&evsel->priv);
+ return -ENOENT;
+}
+
+static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel)
+{
+ struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
+
+ if (evsel->priv != NULL) { /* field, sizeof_field, offsetof_field */
+ if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap))
+ goto out_delete;
+
+ return 0;
+ }
+
+ return -ENOMEM;
+out_delete:
+ zfree(&evsel->priv);
+ return -EINVAL;
+}
+
+static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
+{
+ struct syscall_tp *sc = evsel->priv;
+
+ return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
+}
+
+static int perf_evsel__init_augmented_syscall_tp_ret(struct perf_evsel *evsel)
+{
+ struct syscall_tp *sc = evsel->priv;
+
+ return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
+}
+
+static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
{
evsel->priv = malloc(sizeof(struct syscall_tp));
if (evsel->priv != NULL) {
@@ -258,7 +316,7 @@ out_delete:
return -ENOENT;
}
-static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
+static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
@@ -269,7 +327,7 @@ static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void
if (IS_ERR(evsel))
return NULL;
- if (perf_evsel__init_syscall_tp(evsel, handler))
+ if (perf_evsel__init_raw_syscall_tp(evsel, handler))
goto out_delete;
return evsel;
@@ -291,7 +349,7 @@ size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const cha
{
int idx = val - sa->offset;
- if (idx < 0 || idx >= sa->nr_entries)
+ if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
return scnprintf(bf, size, intfmt, val);
return scnprintf(bf, size, "%s", sa->entries[idx]);
@@ -450,16 +508,6 @@ static const char *clockid[] = {
};
static DEFINE_STRARRAY(clockid);
-static const char *socket_families[] = {
- "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
- "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
- "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
- "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
- "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
- "ALG", "NFC", "VSOCK",
-};
-static DEFINE_STRARRAY(socket_families);
-
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
struct syscall_arg *arg)
{
@@ -567,6 +615,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
struct syscall_arg_fmt {
size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
+ unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
void *parm;
const char *name;
bool show_zero;
@@ -583,6 +632,8 @@ static struct syscall_fmt {
} syscall_fmts[] = {
{ .name = "access",
.arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
+ { .name = "bind",
+ .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, }, },
{ .name = "bpf",
.arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
{ .name = "brk", .hexret = true,
@@ -597,6 +648,8 @@ static struct syscall_fmt {
[4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
{ .name = "close",
.arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
+ { .name = "connect",
+ .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, }, },
{ .name = "epoll_ctl",
.arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
{ .name = "eventfd2",
@@ -674,6 +727,10 @@ static struct syscall_fmt {
.arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
[2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
[3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
+ { .name = "mount",
+ .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
+ [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
+ .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
{ .name = "mprotect",
.arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
[2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
@@ -753,7 +810,8 @@ static struct syscall_fmt {
{ .name = "sendmsg",
.arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
{ .name = "sendto",
- .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
+ .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
+ [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, },
{ .name = "set_tid_address", .errpid = true, },
{ .name = "setitimer",
.arg = { [0] = STRARRAY(which, itimers), }, },
@@ -761,10 +819,12 @@ static struct syscall_fmt {
.arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
{ .name = "socket",
.arg = { [0] = STRARRAY(family, socket_families),
- [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
+ [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
+ [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
{ .name = "socketpair",
.arg = { [0] = STRARRAY(family, socket_families),
- [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
+ [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
+ [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
{ .name = "stat", .alias = "newstat", },
{ .name = "statx",
.arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
@@ -780,6 +840,8 @@ static struct syscall_fmt {
.arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
{ .name = "tkill",
.arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
+ { .name = "umount2", .alias = "umount",
+ .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, },
{ .name = "uname", .alias = "newuname", },
{ .name = "unlinkat",
.arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
@@ -803,12 +865,31 @@ static struct syscall_fmt *syscall_fmt__find(const char *name)
return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}
+static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
+{
+ int i, nmemb = ARRAY_SIZE(syscall_fmts);
+
+ for (i = 0; i < nmemb; ++i) {
+ if (syscall_fmts[i].alias && strcmp(syscall_fmts[i].alias, alias) == 0)
+ return &syscall_fmts[i];
+ }
+
+ return NULL;
+}
+
+/*
+ * is_exit: is this "exit" or "exit_group"?
+ * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
+ * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
+ */
struct syscall {
- struct event_format *tp_format;
+ struct tep_event_format *tp_format;
int nr_args;
- struct format_field *args;
- const char *name;
+ int args_size;
bool is_exit;
+ bool is_open;
+ struct tep_format_field *args;
+ const char *name;
struct syscall_fmt *fmt;
struct syscall_arg_fmt *arg_fmt;
};
@@ -1040,11 +1121,21 @@ static void thread__set_filename_pos(struct thread *thread, const char *bf,
ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
+static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
+{
+ struct augmented_arg *augmented_arg = arg->augmented.args;
+
+ return scnprintf(bf, size, "%.*s", augmented_arg->size, augmented_arg->value);
+}
+
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
struct syscall_arg *arg)
{
unsigned long ptr = arg->val;
+ if (arg->augmented.args)
+ return syscall_arg__scnprintf_augmented_string(arg, bf, size);
+
if (!arg->trace->vfs_getname)
return scnprintf(bf, size, "%#x", ptr);
@@ -1087,11 +1178,9 @@ static void sig_handler(int sig)
interrupted = sig == SIGINT;
}
-static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
- u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
+static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
{
- size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
- printed += fprintf_duration(duration, duration_calculated, fp);
+ size_t printed = 0;
if (trace->multiple_threads) {
if (trace->show_comm)
@@ -1102,6 +1191,14 @@ static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thre
return printed;
}
+static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
+ u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
+{
+ size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
+ printed += fprintf_duration(duration, duration_calculated, fp);
+ return printed + trace__fprintf_comm_tid(trace, thread, fp);
+}
+
static int trace__process_event(struct trace *trace, struct machine *machine,
union perf_event *event, struct perf_sample *sample)
{
@@ -1203,10 +1300,12 @@ static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
static int syscall__set_arg_fmts(struct syscall *sc)
{
- struct format_field *field;
+ struct tep_format_field *field, *last_field = NULL;
int idx = 0, len;
for (field = sc->args; field; field = field->next, ++idx) {
+ last_field = field;
+
if (sc->fmt && sc->fmt->arg[idx].scnprintf)
continue;
@@ -1215,7 +1314,7 @@ static int syscall__set_arg_fmts(struct syscall *sc)
strcmp(field->name, "path") == 0 ||
strcmp(field->name, "pathname") == 0))
sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
- else if (field->flags & FIELD_IS_POINTER)
+ else if (field->flags & TEP_FIELD_IS_POINTER)
sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
else if (strcmp(field->type, "pid_t") == 0)
sc->arg_fmt[idx].scnprintf = SCA_PID;
@@ -1237,6 +1336,9 @@ static int syscall__set_arg_fmts(struct syscall *sc)
}
}
+ if (last_field)
+ sc->args_size = last_field->offset + last_field->size;
+
return 0;
}
@@ -1297,6 +1399,7 @@ static int trace__read_syscall_info(struct trace *trace, int id)
}
sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
+ sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
return syscall__set_arg_fmts(sc);
}
@@ -1403,6 +1506,19 @@ static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
return scnprintf(bf, size, "arg%d: ", arg->idx);
}
+/*
+ * Check if the value is in fact zero, i.e. mask whatever needs masking, such
+ * as mount 'flags' argument that needs ignoring some magic flag, see comment
+ * in tools/perf/trace/beauty/mount_flags.c
+ */
+static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val)
+{
+ if (sc->arg_fmt && sc->arg_fmt[arg->idx].mask_val)
+ return sc->arg_fmt[arg->idx].mask_val(arg, val);
+
+ return val;
+}
+
static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
struct syscall_arg *arg, unsigned long val)
{
@@ -1416,14 +1532,18 @@ static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
}
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
- unsigned char *args, struct trace *trace,
- struct thread *thread)
+ unsigned char *args, void *augmented_args, int augmented_args_size,
+ struct trace *trace, struct thread *thread)
{
size_t printed = 0;
unsigned long val;
u8 bit = 1;
struct syscall_arg arg = {
.args = args,
+ .augmented = {
+ .size = augmented_args_size,
+ .args = augmented_args,
+ },
.idx = 0,
.mask = 0,
.trace = trace,
@@ -1439,7 +1559,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
ttrace->ret_scnprintf = NULL;
if (sc->args != NULL) {
- struct format_field *field;
+ struct tep_format_field *field;
for (field = sc->args; field;
field = field->next, ++arg.idx, bit <<= 1) {
@@ -1447,6 +1567,11 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
continue;
val = syscall_arg__val(&arg, arg.idx);
+ /*
+ * Some syscall args need some mask, most don't and
+ * return val untouched.
+ */
+ val = syscall__mask_val(sc, &arg, val);
/*
* Suppress this argument if its value is zero and
@@ -1578,6 +1703,8 @@ static int trace__printf_interrupted_entry(struct trace *trace)
printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
ttrace->entry_pending = false;
+ ++trace->nr_events_printed;
+
return printed;
}
@@ -1598,6 +1725,32 @@ static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
return printed;
}
+static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, bool raw_augmented)
+{
+ void *augmented_args = NULL;
+ /*
+ * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
+ * and there we get all 6 syscall args plus the tracepoint common
+ * fields (sizeof(long)) and the syscall_nr (another long). So we check
+ * if that is the case and if so don't look after the sc->args_size,
+ * but always after the full raw_syscalls:sys_enter payload, which is
+ * fixed.
+ *
+ * We'll revisit this later to pass s->args_size to the BPF augmenter
+ * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
+ * copies only what we need for each syscall, like what happens when we
+ * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
+ * traffic to just what is needed for each syscall.
+ */
+ int args_size = raw_augmented ? (8 * (int)sizeof(long)) : sc->args_size;
+
+ *augmented_args_size = sample->raw_size - args_size;
+ if (*augmented_args_size > 0)
+ augmented_args = sample->raw_data + args_size;
+
+ return augmented_args;
+}
+
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
@@ -1607,6 +1760,8 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
size_t printed = 0;
struct thread *thread;
int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
+ int augmented_args_size = 0;
+ void *augmented_args = NULL;
struct syscall *sc = trace__syscall_info(trace, evsel, id);
struct thread_trace *ttrace;
@@ -1630,13 +1785,24 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
trace__printf_interrupted_entry(trace);
-
+ /*
+ * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
+ * arguments, even if the syscall being handled, say "openat", uses only 4 arguments
+ * this breaks syscall__augmented_args() check for augmented args, as we calculate
+ * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file,
+ * so when handling, say the openat syscall, we end up getting 6 args for the
+ * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly
+ * thinking that the extra 2 u64 args are the augmented filename, so just check
+ * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
+ */
+ if (evsel != trace->syscalls.events.sys_enter)
+ augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls);
ttrace->entry_time = sample->time;
msg = ttrace->entry_str;
printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
- args, trace, thread);
+ args, augmented_args, augmented_args_size, trace, thread);
if (sc->is_exit) {
if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
@@ -1659,6 +1825,39 @@ out_put:
return err;
}
+static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
+ struct perf_sample *sample)
+{
+ struct thread_trace *ttrace;
+ struct thread *thread;
+ int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
+ struct syscall *sc = trace__syscall_info(trace, evsel, id);
+ char msg[1024];
+ void *args, *augmented_args = NULL;
+ int augmented_args_size;
+
+ if (sc == NULL)
+ return -1;
+
+ thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+ ttrace = thread__trace(thread, trace->output);
+ /*
+ * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
+ * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
+ */
+ if (ttrace == NULL)
+ goto out_put;
+
+ args = perf_evsel__sc_tp_ptr(evsel, args, sample);
+ augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls);
+ syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
+ fprintf(trace->output, "%s", msg);
+ err = 0;
+out_put:
+ thread__put(thread);
+ return err;
+}
+
static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
struct perf_sample *sample,
struct callchain_cursor *cursor)
@@ -1667,12 +1866,14 @@ static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evse
int max_stack = evsel->attr.sample_max_stack ?
evsel->attr.sample_max_stack :
trace->max_stack;
+ int err;
- if (machine__resolve(trace->host, &al, sample) < 0 ||
- thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
+ if (machine__resolve(trace->host, &al, sample) < 0)
return -1;
- return 0;
+ err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
+ addr_location__put(&al);
+ return err;
}
static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
@@ -1720,7 +1921,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
- if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
+ if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
trace__set_fd_pathname(thread, ret, ttrace->filename.name);
ttrace->filename.pending_open = false;
++trace->stats.vfs_getname;
@@ -1797,6 +1998,13 @@ errno_print: {
fputc('\n', trace->output);
+ /*
+ * We only consider an 'event' for the sake of --max-events a non-filtered
+ * sys_enter + sys_exit and other tracepoint events.
+ */
+ if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
+ interrupted = true;
+
if (callchain_ret > 0)
trace__fprintf_callchain(trace, sample);
else if (callchain_ret < 0)
@@ -1929,13 +2137,25 @@ static void bpf_output__fprintf(struct trace *trace,
{
binary__fprintf(sample->raw_data, sample->raw_size, 8,
bpf_output__printer, NULL, trace->output);
+ ++trace->nr_events_printed;
}
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
{
+ struct thread *thread;
int callchain_ret = 0;
+ /*
+ * Check if we called perf_evsel__disable(evsel) due to, for instance,
+ * this event's max_events having been hit and this is an entry coming
+ * from the ring buffer that we should discard, since the max events
+ * have already been considered/printed.
+ */
+ if (evsel->disabled)
+ return 0;
+
+ thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
if (sample->callchain) {
callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
@@ -1952,16 +2172,47 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
if (trace->trace_syscalls)
fprintf(trace->output, "( ): ");
+ if (thread)
+ trace__fprintf_comm_tid(trace, thread, trace->output);
+
+ if (evsel == trace->syscalls.events.augmented) {
+ int id = perf_evsel__sc_tp_uint(evsel, id, sample);
+ struct syscall *sc = trace__syscall_info(trace, evsel, id);
+
+ if (sc) {
+ fprintf(trace->output, "%s(", sc->name);
+ trace__fprintf_sys_enter(trace, evsel, sample);
+ fputc(')', trace->output);
+ goto newline;
+ }
+
+ /*
+ * XXX: Not having the associated syscall info or not finding/adding
+ * the thread should never happen, but if it does...
+ * fall thru and print it as a bpf_output event.
+ */
+ }
+
fprintf(trace->output, "%s:", evsel->name);
if (perf_evsel__is_bpf_output(evsel)) {
bpf_output__fprintf(trace, sample);
} else if (evsel->tp_format) {
- event_format__fprintf(evsel->tp_format, sample->cpu,
- sample->raw_data, sample->raw_size,
- trace->output);
+ if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
+ trace__fprintf_sys_enter(trace, evsel, sample)) {
+ event_format__fprintf(evsel->tp_format, sample->cpu,
+ sample->raw_data, sample->raw_size,
+ trace->output);
+ ++trace->nr_events_printed;
+
+ if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
+ perf_evsel__disable(evsel);
+ perf_evsel__close(evsel);
+ }
+ }
}
+newline:
fprintf(trace->output, "\n");
if (callchain_ret > 0)
@@ -1969,6 +2220,7 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
else if (callchain_ret < 0)
pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
+ thread__put(thread);
return 0;
}
@@ -2024,8 +2276,7 @@ static int trace__pgfault(struct trace *trace,
if (trace->summary_only)
goto out;
- thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
- sample->ip, &al);
+ thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
@@ -2037,12 +2288,10 @@ static int trace__pgfault(struct trace *trace,
fprintf(trace->output, "] => ");
- thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
- sample->addr, &al);
+ thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
if (!al.map) {
- thread__find_addr_location(thread, sample->cpumode,
- MAP__FUNCTION, sample->addr, &al);
+ thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
if (al.map)
map_type = 'x';
@@ -2058,6 +2307,8 @@ static int trace__pgfault(struct trace *trace,
trace__fprintf_callchain(trace, sample);
else if (callchain_ret < 0)
pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
+
+ ++trace->nr_events_printed;
out:
err = 0;
out_put:
@@ -2235,6 +2486,9 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st
tracepoint_handler handler = evsel->handler;
handler(trace, evsel, event, sample);
}
+
+ if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
+ interrupted = true;
}
static int trace__add_syscall_newtp(struct trace *trace)
@@ -2243,14 +2497,14 @@ static int trace__add_syscall_newtp(struct trace *trace)
struct perf_evlist *evlist = trace->evlist;
struct perf_evsel *sys_enter, *sys_exit;
- sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
+ sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
if (sys_enter == NULL)
goto out;
if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
goto out_delete_sys_enter;
- sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
+ sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
if (sys_exit == NULL)
goto out_delete_sys_enter;
@@ -2494,7 +2748,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
* to override an explicitely set --max-stack global setting.
*/
evlist__for_each_entry(evlist, evsel) {
- if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
+ if (evsel__has_callchain(evsel) &&
evsel->attr.sample_max_stack == 0)
evsel->attr.sample_max_stack = trace->max_stack;
}
@@ -2539,7 +2793,7 @@ next_event:
int timeout = done ? 100 : -1;
if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
- if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
+ if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
draining = true;
goto again;
@@ -2672,7 +2926,7 @@ static int trace__replay(struct trace *trace)
"syscalls:sys_enter");
if (evsel &&
- (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
+ (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
pr_err("Error during initialize raw_syscalls:sys_enter event\n");
goto out;
@@ -2684,7 +2938,7 @@ static int trace__replay(struct trace *trace)
evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
"syscalls:sys_exit");
if (evsel &&
- (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
+ (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
pr_err("Error during initialize raw_syscalls:sys_exit event\n");
goto out;
@@ -2924,6 +3178,36 @@ static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
evsel->handler = handler;
}
+static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel;
+
+ evlist__for_each_entry(evlist, evsel) {
+ if (evsel->priv || !evsel->tp_format)
+ continue;
+
+ if (strcmp(evsel->tp_format->system, "syscalls"))
+ continue;
+
+ if (perf_evsel__init_syscall_tp(evsel))
+ return -1;
+
+ if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
+ struct syscall_tp *sc = evsel->priv;
+
+ if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
+ return -1;
+ } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
+ struct syscall_tp *sc = evsel->priv;
+
+ if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
/*
* XXX: Hackish, just splitting the combined -e+--event (syscalls
* (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
@@ -2941,6 +3225,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str,
int len = strlen(str) + 1, err = -1, list, idx;
char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
char group_name[PATH_MAX];
+ struct syscall_fmt *fmt;
if (strace_groups_dir == NULL)
return -1;
@@ -2958,12 +3243,19 @@ static int trace__parse_events_option(const struct option *opt, const char *str,
if (syscalltbl__id(trace->sctbl, s) >= 0 ||
syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
list = 1;
+ goto do_concat;
+ }
+
+ fmt = syscall_fmt__find_by_alias(s);
+ if (fmt != NULL) {
+ list = 1;
+ s = fmt->name;
} else {
path__join(group_name, sizeof(group_name), strace_groups_dir, s);
if (access(group_name, R_OK) == 0)
list = 1;
}
-
+do_concat:
if (lists[list]) {
sprintf(lists[list] + strlen(lists[list]), ",%s", s);
} else {
@@ -2993,6 +3285,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str,
if (trace__validate_ev_qualifier(trace))
goto out;
+ trace->trace_syscalls = true;
}
err = 0;
@@ -3048,9 +3341,10 @@ int cmd_trace(int argc, const char **argv)
},
.output = stderr,
.show_comm = true,
- .trace_syscalls = true,
+ .trace_syscalls = false,
.kernel_syscallchains = false,
.max_stack = UINT_MAX,
+ .max_events = ULONG_MAX,
};
const char *output_name = NULL;
const struct option trace_options[] = {
@@ -3103,6 +3397,8 @@ int cmd_trace(int argc, const char **argv)
&record_parse_callchain_opt),
OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
"Show the kernel callchains on the syscall exit path"),
+ OPT_ULONG(0, "max-events", &trace.max_events,
+ "Set the maximum number of events to print, exit after that is reached. "),
OPT_UINTEGER(0, "min-stack", &trace.min_stack,
"Set the minimum stack depth when parsing the callchain, "
"anything below the specified depth will be ignored."),
@@ -3123,8 +3419,9 @@ int cmd_trace(int argc, const char **argv)
};
bool __maybe_unused max_stack_user_set = true;
bool mmap_pages_user_set = true;
+ struct perf_evsel *evsel;
const char * const trace_subcommands[] = { "record", NULL };
- int err;
+ int err = -1;
char bf[BUFSIZ];
signal(SIGSEGV, sighandler_dump_stack);
@@ -3147,6 +3444,16 @@ int cmd_trace(int argc, const char **argv)
"cgroup monitoring only available in system-wide mode");
}
+ evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
+ if (IS_ERR(evsel)) {
+ bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
+ pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
+ goto out;
+ }
+
+ if (evsel)
+ trace.syscalls.events.augmented = evsel;
+
err = bpf__setup_stdout(trace.evlist);
if (err) {
bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
@@ -3165,7 +3472,7 @@ int cmd_trace(int argc, const char **argv)
mmap_pages_user_set = false;
if (trace.max_stack == UINT_MAX) {
- trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
+ trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
max_stack_user_set = false;
}
@@ -3182,8 +3489,49 @@ int cmd_trace(int argc, const char **argv)
symbol_conf.use_callchain = true;
}
- if (trace.evlist->nr_entries > 0)
+ if (trace.evlist->nr_entries > 0) {
evlist__set_evsel_handler(trace.evlist, trace__event_handler);
+ if (evlist__set_syscall_tp_fields(trace.evlist)) {
+ perror("failed to set syscalls:* tracepoint fields");
+ goto out;
+ }
+ }
+
+ /*
+ * If we are augmenting syscalls, then combine what we put in the
+ * __augmented_syscalls__ BPF map with what is in the
+ * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
+ * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
+ *
+ * We'll switch to look at two BPF maps, one for sys_enter and the
+ * other for sys_exit when we start augmenting the sys_exit paths with
+ * buffers that are being copied from kernel to userspace, think 'read'
+ * syscall.
+ */
+ if (trace.syscalls.events.augmented) {
+ evsel = trace.syscalls.events.augmented;
+
+ if (perf_evsel__init_augmented_syscall_tp(evsel) ||
+ perf_evsel__init_augmented_syscall_tp_args(evsel))
+ goto out;
+ evsel->handler = trace__sys_enter;
+
+ evlist__for_each_entry(trace.evlist, evsel) {
+ bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
+
+ if (raw_syscalls_sys_exit) {
+ trace.raw_augmented_syscalls = true;
+ goto init_augmented_syscall_tp;
+ }
+
+ if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
+init_augmented_syscall_tp:
+ perf_evsel__init_augmented_syscall_tp(evsel);
+ perf_evsel__init_augmented_syscall_tp_ret(evsel);
+ evsel->handler = trace__sys_exit;
+ }
+ }
+ }
if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
return trace__record(&trace, argc-1, &argv[1]);
@@ -3194,13 +3542,7 @@ int cmd_trace(int argc, const char **argv)
if (!trace.trace_syscalls && !trace.trace_pgfaults &&
trace.evlist->nr_entries == 0 /* Was --events used? */) {
- pr_err("Please specify something to trace.\n");
- return -1;
- }
-
- if (!trace.trace_syscalls && trace.ev_qualifier) {
- pr_err("The -e option can't be used with --no-syscalls.\n");
- goto out;
+ trace.trace_syscalls = true;
}
if (output_name != NULL) {
@@ -3211,8 +3553,6 @@ int cmd_trace(int argc, const char **argv)
}
}
- trace.open_id = syscalltbl__id(trace.sctbl, "open");
-
err = target__validate(&trace.opts.target);
if (err) {
target__strerror(&trace.opts.target, err, bf, sizeof(bf));