diff options
Diffstat (limited to 'tools/perf/builtin-trace.c')
-rw-r--r-- | tools/perf/builtin-trace.c | 490 |
1 files changed, 415 insertions, 75 deletions
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 3ad17ee89403..835619476370 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -77,7 +77,8 @@ struct trace { struct syscall *table; struct { struct perf_evsel *sys_enter, - *sys_exit; + *sys_exit, + *augmented; } events; } syscalls; struct record_opts opts; @@ -88,6 +89,8 @@ struct trace { u64 base_time; FILE *output; unsigned long nr_events; + unsigned long nr_events_printed; + unsigned long max_events; struct strlist *ev_qualifier; struct { size_t nr; @@ -105,6 +108,7 @@ struct trace { } stats; unsigned int max_stack; unsigned int min_stack; + bool raw_augmented_syscalls; bool not_ev_qualifier; bool live; bool full_time; @@ -121,7 +125,6 @@ struct trace { bool force; bool vfs_getname; int trace_pgfaults; - int open_id; }; struct tp_field { @@ -157,13 +160,11 @@ TP_UINT_FIELD__SWAPPED(16); TP_UINT_FIELD__SWAPPED(32); TP_UINT_FIELD__SWAPPED(64); -static int tp_field__init_uint(struct tp_field *field, - struct format_field *format_field, - bool needs_swap) +static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap) { - field->offset = format_field->offset; + field->offset = offset; - switch (format_field->size) { + switch (size) { case 1: field->integer = tp_field__u8; break; @@ -183,18 +184,28 @@ static int tp_field__init_uint(struct tp_field *field, return 0; } +static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap) +{ + return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap); +} + static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample) { return sample->raw_data + field->offset; } -static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field) +static int __tp_field__init_ptr(struct tp_field *field, int offset) { - field->offset = format_field->offset; + field->offset = offset; field->pointer = tp_field__ptr; return 0; } +static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field) +{ + return __tp_field__init_ptr(field, format_field->offset); +} + struct syscall_tp { struct tp_field id; union { @@ -206,7 +217,7 @@ static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel, struct tp_field *field, const char *name) { - struct format_field *format_field = perf_evsel__field(evsel, name); + struct tep_format_field *format_field = perf_evsel__field(evsel, name); if (format_field == NULL) return -1; @@ -222,7 +233,7 @@ static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel, struct tp_field *field, const char *name) { - struct format_field *format_field = perf_evsel__field(evsel, name); + struct tep_format_field *format_field = perf_evsel__field(evsel, name); if (format_field == NULL) return -1; @@ -240,7 +251,54 @@ static void perf_evsel__delete_priv(struct perf_evsel *evsel) perf_evsel__delete(evsel); } -static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler) +static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel) +{ + struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp)); + + if (evsel->priv != NULL) { + if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr")) + goto out_delete; + return 0; + } + + return -ENOMEM; +out_delete: + zfree(&evsel->priv); + return -ENOENT; +} + +static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel) +{ + struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp)); + + if (evsel->priv != NULL) { /* field, sizeof_field, offsetof_field */ + if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap)) + goto out_delete; + + return 0; + } + + return -ENOMEM; +out_delete: + zfree(&evsel->priv); + return -EINVAL; +} + +static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel) +{ + struct syscall_tp *sc = evsel->priv; + + return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)); +} + +static int perf_evsel__init_augmented_syscall_tp_ret(struct perf_evsel *evsel) +{ + struct syscall_tp *sc = evsel->priv; + + return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap); +} + +static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler) { evsel->priv = malloc(sizeof(struct syscall_tp)); if (evsel->priv != NULL) { @@ -258,7 +316,7 @@ out_delete: return -ENOENT; } -static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler) +static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler) { struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction); @@ -269,7 +327,7 @@ static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void if (IS_ERR(evsel)) return NULL; - if (perf_evsel__init_syscall_tp(evsel, handler)) + if (perf_evsel__init_raw_syscall_tp(evsel, handler)) goto out_delete; return evsel; @@ -291,7 +349,7 @@ size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const cha { int idx = val - sa->offset; - if (idx < 0 || idx >= sa->nr_entries) + if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) return scnprintf(bf, size, intfmt, val); return scnprintf(bf, size, "%s", sa->entries[idx]); @@ -450,16 +508,6 @@ static const char *clockid[] = { }; static DEFINE_STRARRAY(clockid); -static const char *socket_families[] = { - "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM", - "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI", - "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC", - "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC", - "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF", - "ALG", "NFC", "VSOCK", -}; -static DEFINE_STRARRAY(socket_families); - static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size, struct syscall_arg *arg) { @@ -567,6 +615,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, struct syscall_arg_fmt { size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); + unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val); void *parm; const char *name; bool show_zero; @@ -583,6 +632,8 @@ static struct syscall_fmt { } syscall_fmts[] = { { .name = "access", .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, + { .name = "bind", + .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, }, }, { .name = "bpf", .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, }, { .name = "brk", .hexret = true, @@ -597,6 +648,8 @@ static struct syscall_fmt { [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, }, { .name = "close", .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, }, + { .name = "connect", + .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, }, }, { .name = "epoll_ctl", .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, }, { .name = "eventfd2", @@ -674,6 +727,10 @@ static struct syscall_fmt { .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, }, + { .name = "mount", + .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ }, + [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */ + .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, }, { .name = "mprotect", .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, }, @@ -753,7 +810,8 @@ static struct syscall_fmt { { .name = "sendmsg", .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, { .name = "sendto", - .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, + .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, + [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, }, { .name = "set_tid_address", .errpid = true, }, { .name = "setitimer", .arg = { [0] = STRARRAY(which, itimers), }, }, @@ -761,10 +819,12 @@ static struct syscall_fmt { .arg = { [0] = STRARRAY(resource, rlimit_resources), }, }, { .name = "socket", .arg = { [0] = STRARRAY(family, socket_families), - [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, }, + [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, + [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, }, { .name = "socketpair", .arg = { [0] = STRARRAY(family, socket_families), - [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, }, + [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, + [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, }, { .name = "stat", .alias = "newstat", }, { .name = "statx", .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ }, @@ -780,6 +840,8 @@ static struct syscall_fmt { .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, { .name = "tkill", .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, + { .name = "umount2", .alias = "umount", + .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, }, { .name = "uname", .alias = "newuname", }, { .name = "unlinkat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, @@ -803,12 +865,31 @@ static struct syscall_fmt *syscall_fmt__find(const char *name) return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); } +static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias) +{ + int i, nmemb = ARRAY_SIZE(syscall_fmts); + + for (i = 0; i < nmemb; ++i) { + if (syscall_fmts[i].alias && strcmp(syscall_fmts[i].alias, alias) == 0) + return &syscall_fmts[i]; + } + + return NULL; +} + +/* + * is_exit: is this "exit" or "exit_group"? + * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter. + * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc. + */ struct syscall { - struct event_format *tp_format; + struct tep_event_format *tp_format; int nr_args; - struct format_field *args; - const char *name; + int args_size; bool is_exit; + bool is_open; + struct tep_format_field *args; + const char *name; struct syscall_fmt *fmt; struct syscall_arg_fmt *arg_fmt; }; @@ -1040,11 +1121,21 @@ static void thread__set_filename_pos(struct thread *thread, const char *bf, ttrace->filename.entry_str_pos = bf - ttrace->entry_str; } +static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size) +{ + struct augmented_arg *augmented_arg = arg->augmented.args; + + return scnprintf(bf, size, "%.*s", augmented_arg->size, augmented_arg->value); +} + static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, struct syscall_arg *arg) { unsigned long ptr = arg->val; + if (arg->augmented.args) + return syscall_arg__scnprintf_augmented_string(arg, bf, size); + if (!arg->trace->vfs_getname) return scnprintf(bf, size, "%#x", ptr); @@ -1087,11 +1178,9 @@ static void sig_handler(int sig) interrupted = sig == SIGINT; } -static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread, - u64 duration, bool duration_calculated, u64 tstamp, FILE *fp) +static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp) { - size_t printed = trace__fprintf_tstamp(trace, tstamp, fp); - printed += fprintf_duration(duration, duration_calculated, fp); + size_t printed = 0; if (trace->multiple_threads) { if (trace->show_comm) @@ -1102,6 +1191,14 @@ static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thre return printed; } +static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread, + u64 duration, bool duration_calculated, u64 tstamp, FILE *fp) +{ + size_t printed = trace__fprintf_tstamp(trace, tstamp, fp); + printed += fprintf_duration(duration, duration_calculated, fp); + return printed + trace__fprintf_comm_tid(trace, thread, fp); +} + static int trace__process_event(struct trace *trace, struct machine *machine, union perf_event *event, struct perf_sample *sample) { @@ -1203,10 +1300,12 @@ static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args) static int syscall__set_arg_fmts(struct syscall *sc) { - struct format_field *field; + struct tep_format_field *field, *last_field = NULL; int idx = 0, len; for (field = sc->args; field; field = field->next, ++idx) { + last_field = field; + if (sc->fmt && sc->fmt->arg[idx].scnprintf) continue; @@ -1215,7 +1314,7 @@ static int syscall__set_arg_fmts(struct syscall *sc) strcmp(field->name, "path") == 0 || strcmp(field->name, "pathname") == 0)) sc->arg_fmt[idx].scnprintf = SCA_FILENAME; - else if (field->flags & FIELD_IS_POINTER) + else if (field->flags & TEP_FIELD_IS_POINTER) sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex; else if (strcmp(field->type, "pid_t") == 0) sc->arg_fmt[idx].scnprintf = SCA_PID; @@ -1237,6 +1336,9 @@ static int syscall__set_arg_fmts(struct syscall *sc) } } + if (last_field) + sc->args_size = last_field->offset + last_field->size; + return 0; } @@ -1297,6 +1399,7 @@ static int trace__read_syscall_info(struct trace *trace, int id) } sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit"); + sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat"); return syscall__set_arg_fmts(sc); } @@ -1403,6 +1506,19 @@ static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size, return scnprintf(bf, size, "arg%d: ", arg->idx); } +/* + * Check if the value is in fact zero, i.e. mask whatever needs masking, such + * as mount 'flags' argument that needs ignoring some magic flag, see comment + * in tools/perf/trace/beauty/mount_flags.c + */ +static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val) +{ + if (sc->arg_fmt && sc->arg_fmt[arg->idx].mask_val) + return sc->arg_fmt[arg->idx].mask_val(arg, val); + + return val; +} + static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size, struct syscall_arg *arg, unsigned long val) { @@ -1416,14 +1532,18 @@ static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size, } static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, - unsigned char *args, struct trace *trace, - struct thread *thread) + unsigned char *args, void *augmented_args, int augmented_args_size, + struct trace *trace, struct thread *thread) { size_t printed = 0; unsigned long val; u8 bit = 1; struct syscall_arg arg = { .args = args, + .augmented = { + .size = augmented_args_size, + .args = augmented_args, + }, .idx = 0, .mask = 0, .trace = trace, @@ -1439,7 +1559,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, ttrace->ret_scnprintf = NULL; if (sc->args != NULL) { - struct format_field *field; + struct tep_format_field *field; for (field = sc->args; field; field = field->next, ++arg.idx, bit <<= 1) { @@ -1447,6 +1567,11 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, continue; val = syscall_arg__val(&arg, arg.idx); + /* + * Some syscall args need some mask, most don't and + * return val untouched. + */ + val = syscall__mask_val(sc, &arg, val); /* * Suppress this argument if its value is zero and @@ -1578,6 +1703,8 @@ static int trace__printf_interrupted_entry(struct trace *trace) printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str); ttrace->entry_pending = false; + ++trace->nr_events_printed; + return printed; } @@ -1598,6 +1725,32 @@ static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel, return printed; } +static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, bool raw_augmented) +{ + void *augmented_args = NULL; + /* + * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter + * and there we get all 6 syscall args plus the tracepoint common + * fields (sizeof(long)) and the syscall_nr (another long). So we check + * if that is the case and if so don't look after the sc->args_size, + * but always after the full raw_syscalls:sys_enter payload, which is + * fixed. + * + * We'll revisit this later to pass s->args_size to the BPF augmenter + * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it + * copies only what we need for each syscall, like what happens when we + * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace + * traffic to just what is needed for each syscall. + */ + int args_size = raw_augmented ? (8 * (int)sizeof(long)) : sc->args_size; + + *augmented_args_size = sample->raw_size - args_size; + if (*augmented_args_size > 0) + augmented_args = sample->raw_data + args_size; + + return augmented_args; +} + static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) @@ -1607,6 +1760,8 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, size_t printed = 0; struct thread *thread; int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; + int augmented_args_size = 0; + void *augmented_args = NULL; struct syscall *sc = trace__syscall_info(trace, evsel, id); struct thread_trace *ttrace; @@ -1630,13 +1785,24 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) trace__printf_interrupted_entry(trace); - + /* + * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible + * arguments, even if the syscall being handled, say "openat", uses only 4 arguments + * this breaks syscall__augmented_args() check for augmented args, as we calculate + * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file, + * so when handling, say the openat syscall, we end up getting 6 args for the + * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly + * thinking that the extra 2 u64 args are the augmented filename, so just check + * here and avoid using augmented syscalls when the evsel is the raw_syscalls one. + */ + if (evsel != trace->syscalls.events.sys_enter) + augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls); ttrace->entry_time = sample->time; msg = ttrace->entry_str; printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed, - args, trace, thread); + args, augmented_args, augmented_args_size, trace, thread); if (sc->is_exit) { if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) { @@ -1659,6 +1825,39 @@ out_put: return err; } +static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel, + struct perf_sample *sample) +{ + struct thread_trace *ttrace; + struct thread *thread; + int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; + struct syscall *sc = trace__syscall_info(trace, evsel, id); + char msg[1024]; + void *args, *augmented_args = NULL; + int augmented_args_size; + + if (sc == NULL) + return -1; + + thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); + ttrace = thread__trace(thread, trace->output); + /* + * We need to get ttrace just to make sure it is there when syscall__scnprintf_args() + * and the rest of the beautifiers accessing it via struct syscall_arg touches it. + */ + if (ttrace == NULL) + goto out_put; + + args = perf_evsel__sc_tp_ptr(evsel, args, sample); + augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls); + syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread); + fprintf(trace->output, "%s", msg); + err = 0; +out_put: + thread__put(thread); + return err; +} + static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel, struct perf_sample *sample, struct callchain_cursor *cursor) @@ -1667,12 +1866,14 @@ static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evse int max_stack = evsel->attr.sample_max_stack ? evsel->attr.sample_max_stack : trace->max_stack; + int err; - if (machine__resolve(trace->host, &al, sample) < 0 || - thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack)) + if (machine__resolve(trace->host, &al, sample) < 0) return -1; - return 0; + err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack); + addr_location__put(&al); + return err; } static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) @@ -1720,7 +1921,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, ret = perf_evsel__sc_tp_uint(evsel, ret, sample); - if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) { + if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) { trace__set_fd_pathname(thread, ret, ttrace->filename.name); ttrace->filename.pending_open = false; ++trace->stats.vfs_getname; @@ -1797,6 +1998,13 @@ errno_print: { fputc('\n', trace->output); + /* + * We only consider an 'event' for the sake of --max-events a non-filtered + * sys_enter + sys_exit and other tracepoint events. + */ + if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX) + interrupted = true; + if (callchain_ret > 0) trace__fprintf_callchain(trace, sample); else if (callchain_ret < 0) @@ -1929,13 +2137,25 @@ static void bpf_output__fprintf(struct trace *trace, { binary__fprintf(sample->raw_data, sample->raw_size, 8, bpf_output__printer, NULL, trace->output); + ++trace->nr_events_printed; } static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) { + struct thread *thread; int callchain_ret = 0; + /* + * Check if we called perf_evsel__disable(evsel) due to, for instance, + * this event's max_events having been hit and this is an entry coming + * from the ring buffer that we should discard, since the max events + * have already been considered/printed. + */ + if (evsel->disabled) + return 0; + + thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); if (sample->callchain) { callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); @@ -1952,16 +2172,47 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, if (trace->trace_syscalls) fprintf(trace->output, "( ): "); + if (thread) + trace__fprintf_comm_tid(trace, thread, trace->output); + + if (evsel == trace->syscalls.events.augmented) { + int id = perf_evsel__sc_tp_uint(evsel, id, sample); + struct syscall *sc = trace__syscall_info(trace, evsel, id); + + if (sc) { + fprintf(trace->output, "%s(", sc->name); + trace__fprintf_sys_enter(trace, evsel, sample); + fputc(')', trace->output); + goto newline; + } + + /* + * XXX: Not having the associated syscall info or not finding/adding + * the thread should never happen, but if it does... + * fall thru and print it as a bpf_output event. + */ + } + fprintf(trace->output, "%s:", evsel->name); if (perf_evsel__is_bpf_output(evsel)) { bpf_output__fprintf(trace, sample); } else if (evsel->tp_format) { - event_format__fprintf(evsel->tp_format, sample->cpu, - sample->raw_data, sample->raw_size, - trace->output); + if (strncmp(evsel->tp_format->name, "sys_enter_", 10) || + trace__fprintf_sys_enter(trace, evsel, sample)) { + event_format__fprintf(evsel->tp_format, sample->cpu, + sample->raw_data, sample->raw_size, + trace->output); + ++trace->nr_events_printed; + + if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) { + perf_evsel__disable(evsel); + perf_evsel__close(evsel); + } + } } +newline: fprintf(trace->output, "\n"); if (callchain_ret > 0) @@ -1969,6 +2220,7 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, else if (callchain_ret < 0) pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); out: + thread__put(thread); return 0; } @@ -2024,8 +2276,7 @@ static int trace__pgfault(struct trace *trace, if (trace->summary_only) goto out; - thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION, - sample->ip, &al); + thread__find_symbol(thread, sample->cpumode, sample->ip, &al); trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output); @@ -2037,12 +2288,10 @@ static int trace__pgfault(struct trace *trace, fprintf(trace->output, "] => "); - thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE, - sample->addr, &al); + thread__find_symbol(thread, sample->cpumode, sample->addr, &al); if (!al.map) { - thread__find_addr_location(thread, sample->cpumode, - MAP__FUNCTION, sample->addr, &al); + thread__find_symbol(thread, sample->cpumode, sample->addr, &al); if (al.map) map_type = 'x'; @@ -2058,6 +2307,8 @@ static int trace__pgfault(struct trace *trace, trace__fprintf_callchain(trace, sample); else if (callchain_ret < 0) pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); + + ++trace->nr_events_printed; out: err = 0; out_put: @@ -2235,6 +2486,9 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st tracepoint_handler handler = evsel->handler; handler(trace, evsel, event, sample); } + + if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX) + interrupted = true; } static int trace__add_syscall_newtp(struct trace *trace) @@ -2243,14 +2497,14 @@ static int trace__add_syscall_newtp(struct trace *trace) struct perf_evlist *evlist = trace->evlist; struct perf_evsel *sys_enter, *sys_exit; - sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter); + sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter); if (sys_enter == NULL) goto out; if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args)) goto out_delete_sys_enter; - sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit); + sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit); if (sys_exit == NULL) goto out_delete_sys_enter; @@ -2494,7 +2748,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) * to override an explicitely set --max-stack global setting. */ evlist__for_each_entry(evlist, evsel) { - if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && + if (evsel__has_callchain(evsel) && evsel->attr.sample_max_stack == 0) evsel->attr.sample_max_stack = trace->max_stack; } @@ -2539,7 +2793,7 @@ next_event: int timeout = done ? 100 : -1; if (!draining && perf_evlist__poll(evlist, timeout) > 0) { - if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0) + if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0) draining = true; goto again; @@ -2672,7 +2926,7 @@ static int trace__replay(struct trace *trace) "syscalls:sys_enter"); if (evsel && - (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 || + (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 || perf_evsel__init_sc_tp_ptr_field(evsel, args))) { pr_err("Error during initialize raw_syscalls:sys_enter event\n"); goto out; @@ -2684,7 +2938,7 @@ static int trace__replay(struct trace *trace) evsel = perf_evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit"); if (evsel && - (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 || + (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 || perf_evsel__init_sc_tp_uint_field(evsel, ret))) { pr_err("Error during initialize raw_syscalls:sys_exit event\n"); goto out; @@ -2924,6 +3178,36 @@ static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler) evsel->handler = handler; } +static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + if (evsel->priv || !evsel->tp_format) + continue; + + if (strcmp(evsel->tp_format->system, "syscalls")) + continue; + + if (perf_evsel__init_syscall_tp(evsel)) + return -1; + + if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) { + struct syscall_tp *sc = evsel->priv; + + if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64))) + return -1; + } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) { + struct syscall_tp *sc = evsel->priv; + + if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap)) + return -1; + } + } + + return 0; +} + /* * XXX: Hackish, just splitting the combined -e+--event (syscalls * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use @@ -2941,6 +3225,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str, int len = strlen(str) + 1, err = -1, list, idx; char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); char group_name[PATH_MAX]; + struct syscall_fmt *fmt; if (strace_groups_dir == NULL) return -1; @@ -2958,12 +3243,19 @@ static int trace__parse_events_option(const struct option *opt, const char *str, if (syscalltbl__id(trace->sctbl, s) >= 0 || syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) { list = 1; + goto do_concat; + } + + fmt = syscall_fmt__find_by_alias(s); + if (fmt != NULL) { + list = 1; + s = fmt->name; } else { path__join(group_name, sizeof(group_name), strace_groups_dir, s); if (access(group_name, R_OK) == 0) list = 1; } - +do_concat: if (lists[list]) { sprintf(lists[list] + strlen(lists[list]), ",%s", s); } else { @@ -2993,6 +3285,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str, if (trace__validate_ev_qualifier(trace)) goto out; + trace->trace_syscalls = true; } err = 0; @@ -3048,9 +3341,10 @@ int cmd_trace(int argc, const char **argv) }, .output = stderr, .show_comm = true, - .trace_syscalls = true, + .trace_syscalls = false, .kernel_syscallchains = false, .max_stack = UINT_MAX, + .max_events = ULONG_MAX, }; const char *output_name = NULL; const struct option trace_options[] = { @@ -3103,6 +3397,8 @@ int cmd_trace(int argc, const char **argv) &record_parse_callchain_opt), OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains, "Show the kernel callchains on the syscall exit path"), + OPT_ULONG(0, "max-events", &trace.max_events, + "Set the maximum number of events to print, exit after that is reached. "), OPT_UINTEGER(0, "min-stack", &trace.min_stack, "Set the minimum stack depth when parsing the callchain, " "anything below the specified depth will be ignored."), @@ -3123,8 +3419,9 @@ int cmd_trace(int argc, const char **argv) }; bool __maybe_unused max_stack_user_set = true; bool mmap_pages_user_set = true; + struct perf_evsel *evsel; const char * const trace_subcommands[] = { "record", NULL }; - int err; + int err = -1; char bf[BUFSIZ]; signal(SIGSEGV, sighandler_dump_stack); @@ -3147,6 +3444,16 @@ int cmd_trace(int argc, const char **argv) "cgroup monitoring only available in system-wide mode"); } + evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__"); + if (IS_ERR(evsel)) { + bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf)); + pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf); + goto out; + } + + if (evsel) + trace.syscalls.events.augmented = evsel; + err = bpf__setup_stdout(trace.evlist); if (err) { bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf)); @@ -3165,7 +3472,7 @@ int cmd_trace(int argc, const char **argv) mmap_pages_user_set = false; if (trace.max_stack == UINT_MAX) { - trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack; + trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack(); max_stack_user_set = false; } @@ -3182,8 +3489,49 @@ int cmd_trace(int argc, const char **argv) symbol_conf.use_callchain = true; } - if (trace.evlist->nr_entries > 0) + if (trace.evlist->nr_entries > 0) { evlist__set_evsel_handler(trace.evlist, trace__event_handler); + if (evlist__set_syscall_tp_fields(trace.evlist)) { + perror("failed to set syscalls:* tracepoint fields"); + goto out; + } + } + + /* + * If we are augmenting syscalls, then combine what we put in the + * __augmented_syscalls__ BPF map with what is in the + * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF, + * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit. + * + * We'll switch to look at two BPF maps, one for sys_enter and the + * other for sys_exit when we start augmenting the sys_exit paths with + * buffers that are being copied from kernel to userspace, think 'read' + * syscall. + */ + if (trace.syscalls.events.augmented) { + evsel = trace.syscalls.events.augmented; + + if (perf_evsel__init_augmented_syscall_tp(evsel) || + perf_evsel__init_augmented_syscall_tp_args(evsel)) + goto out; + evsel->handler = trace__sys_enter; + + evlist__for_each_entry(trace.evlist, evsel) { + bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0; + + if (raw_syscalls_sys_exit) { + trace.raw_augmented_syscalls = true; + goto init_augmented_syscall_tp; + } + + if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) { +init_augmented_syscall_tp: + perf_evsel__init_augmented_syscall_tp(evsel); + perf_evsel__init_augmented_syscall_tp_ret(evsel); + evsel->handler = trace__sys_exit; + } + } + } if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) return trace__record(&trace, argc-1, &argv[1]); @@ -3194,13 +3542,7 @@ int cmd_trace(int argc, const char **argv) if (!trace.trace_syscalls && !trace.trace_pgfaults && trace.evlist->nr_entries == 0 /* Was --events used? */) { - pr_err("Please specify something to trace.\n"); - return -1; - } - - if (!trace.trace_syscalls && trace.ev_qualifier) { - pr_err("The -e option can't be used with --no-syscalls.\n"); - goto out; + trace.trace_syscalls = true; } if (output_name != NULL) { @@ -3211,8 +3553,6 @@ int cmd_trace(int argc, const char **argv) } } - trace.open_id = syscalltbl__id(trace.sctbl, "open"); - err = target__validate(&trace.opts.target); if (err) { target__strerror(&trace.opts.target, err, bf, sizeof(bf)); |