diff options
Diffstat (limited to 'kernel/trace')
-rw-r--r-- | kernel/trace/blktrace.c | 279 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 82 | ||||
-rw-r--r-- | kernel/trace/trace.c | 32 | ||||
-rw-r--r-- | kernel/trace/trace.h | 3 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 17 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_functions_graph.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_mmiotrace.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_output.c | 21 | ||||
-rw-r--r-- | kernel/trace/trace_sched_wakeup.c | 8 | ||||
-rw-r--r-- | kernel/trace/trace_selftest.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 15 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 53 |
13 files changed, 359 insertions, 158 deletions
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc364f86100a..45a3928544ce 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -27,6 +27,7 @@ #include <linux/time.h> #include <linux/uaccess.h> #include <linux/list.h> +#include <linux/blk-cgroup.h> #include "../../block/blk.h" @@ -46,10 +47,16 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 +#define TRACE_BLK_OPT_CGROUP 0x2 +#define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, +#ifdef CONFIG_BLK_CGROUP + { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, + { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, +#endif { } }; @@ -68,7 +75,8 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) + const void *data, size_t len, + union kernfs_node_id *cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -76,12 +84,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len, + sizeof(*t) + len + cgid_len, 0, pc); if (!event) return; @@ -92,17 +101,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len); + t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); record_it: t->device = bt->dev; - t->action = action; + t->action = action | (cgid ? __BLK_TN_CGROUP : 0); t->pid = pid; t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); + t->pdu_len = len + cgid_len; + if (cgid) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -122,7 +133,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm)); + sizeof(tsk->comm), NULL); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,11 +150,12 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, + const char *fmt, ...) { int n; va_list args; @@ -167,7 +179,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + blkcg = NULL; +#ifdef CONFIG_BLK_CGROUP + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); +#else + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -204,7 +223,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data) + void *pdu_data, union kernfs_node_id *cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -215,6 +234,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -229,6 +249,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); + if (cgid) + what |= __BLK_TA_CGROUP; pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -241,7 +263,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len, + sizeof(*t) + pdu_len + cgid_len, 0, pc); if (!event) return; @@ -258,7 +280,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. */ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -280,10 +302,12 @@ record_it: t->action = what; t->device = bt->dev; t->error = error; - t->pdu_len = pdu_len; + t->pdu_len = pdu_len + cgid_len; + if (cgid_len) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -359,7 +383,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, "%s", msg); + __trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; @@ -624,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start) } EXPORT_SYMBOL_GPL(blk_trace_startstop); +/* + * When reading or writing the blktrace sysfs files, the references to the + * opened sysfs or device files should prevent the underlying block device + * from being removed. So no further delete protection is really needed. + */ + /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device @@ -641,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); switch (cmd) { case BLKTRACESETUP: @@ -667,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); return ret; } @@ -684,6 +714,36 @@ void blk_trace_shutdown(struct request_queue *q) } } +#ifdef CONFIG_BLK_CGROUP +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + struct blk_trace *bt = q->blk_trace; + + if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + return NULL; + + if (!bio->bi_css) + return NULL; + return cgroup_get_kernfs_id(bio->bi_css->cgroup); +} +#else +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + return NULL; +} +#endif + +static union kernfs_node_id * +blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) +{ + if (!rq->bio) + return NULL; + /* Use the first bio */ + return blk_trace_bio_get_cgid(q, rq->bio); +} + /* * blktrace probes */ @@ -694,13 +754,15 @@ void blk_trace_shutdown(struct request_queue *q) * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action + * @cgid: the cgroup info * * Description: * Records an action against a request. Will log the bio offset + size. * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what) + unsigned int nr_bytes, u32 what, + union kernfs_node_id *cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -713,32 +775,36 @@ static void blk_add_trace_rq(struct request *rq, int error, what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, error, 0, NULL); + rq->cmd_flags, what, error, 0, NULL, cgid); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, int error, unsigned int nr_bytes) { - blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, + blk_trace_request_get_cgid(rq->q, rq)); } /** @@ -753,7 +819,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u32 what, int error, union kernfs_node_id *cgid) { struct blk_trace *bt = q->blk_trace; @@ -761,20 +827,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, what, error, 0, NULL); + bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio, int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_backmerge(void *ignore, @@ -782,7 +850,8 @@ static void blk_add_trace_bio_backmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_frontmerge(void *ignore, @@ -790,13 +859,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_getrq(void *ignore, @@ -804,13 +875,14 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL); + NULL, NULL); } } @@ -820,13 +892,14 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL); + 0, 0, NULL, NULL); } } @@ -835,7 +908,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -852,7 +925,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); } } @@ -868,7 +941,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), - &rpdu); + &rpdu, blk_trace_bio_get_cgid(q, bio)); } } @@ -896,12 +969,12 @@ static void blk_add_trace_bio_remap(void *ignore, return; r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.device_to = cpu_to_be32(bio_dev(bio)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, - sizeof(r), &r); + sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); } /** @@ -934,7 +1007,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, - sizeof(r), &r); + sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); } /** @@ -958,7 +1031,8 @@ void blk_add_driver_data(struct request_queue *q, return; __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, 0, len, data); + BLK_TA_DRV_DATA, 0, len, data, + blk_trace_request_get_cgid(q, rq)); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1031,7 +1105,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) int i = 0; int tc = t->action >> BLK_TC_SHIFT; - if (t->action == BLK_TN_MESSAGE) { + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { rwbs[i++] = 'N'; goto out; } @@ -1066,9 +1140,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) return (const struct blk_io_trace *)ent; } -static inline const void *pdu_start(const struct trace_entry *ent) +static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) +{ + return (void *)(te_blk_io_trace(ent) + 1) + + (has_cg ? sizeof(union kernfs_node_id) : 0); +} + +static inline const void *cgid_start(const struct trace_entry *ent) { - return te_blk_io_trace(ent) + 1; + return (void *)(te_blk_io_trace(ent) + 1); +} + +static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) +{ + return te_blk_io_trace(ent)->pdu_len - + (has_cg ? sizeof(union kernfs_node_id) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1096,16 +1182,16 @@ static inline __u16 t_error(const struct trace_entry *ent) return te_blk_io_trace(ent)->error; } -static __u64 get_pdu_int(const struct trace_entry *ent) +static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { - const __u64 *val = pdu_start(ent); + const __u64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r) + struct blk_io_trace_remap *r, bool has_cg) { - const struct blk_io_trace_remap *__r = pdu_start(ent); + const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); __u64 sector_from = __r->sector_from; r->device_from = be32_to_cpu(__r->device_from); @@ -1113,9 +1199,11 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector_from = be64_to_cpu(sector_from); } -typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, + bool has_cg); -static void blk_log_action_classic(struct trace_iterator *iter, const char *act) +static void blk_log_action_classic(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; @@ -1131,24 +1219,43 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act) secs, nsec_rem, iter->ent->pid, act, rwbs); } -static void blk_log_action(struct trace_iterator *iter, const char *act) +static void blk_log_action(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); - trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); -} - -static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) + if (has_cg) { + const union kernfs_node_id *id = cgid_start(iter->ent); + + if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { + char blkcg_name_buf[NAME_MAX + 1] = "<...>"; + + cgroup_path_from_kernfs_id(id, blkcg_name_buf, + sizeof(blkcg_name_buf)); + trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", + MAJOR(t->device), MINOR(t->device), + blkcg_name_buf, act, rwbs); + } else + trace_seq_printf(&iter->seq, + "%3d,%-3d %x,%-x %2s %3s ", + MAJOR(t->device), MINOR(t->device), + id->ino, id->generation, act, rwbs); + } else + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} + +static void blk_log_dump_pdu(struct trace_seq *s, + const struct trace_entry *ent, bool has_cg) { const unsigned char *pdu_buf; int pdu_len; int i, end; - pdu_buf = pdu_start(ent); - pdu_len = te_blk_io_trace(ent)->pdu_len; + pdu_buf = pdu_start(ent, has_cg); + pdu_len = pdu_real_len(ent, has_cg); if (!pdu_len) return; @@ -1179,7 +1286,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) trace_seq_puts(s, ") "); } -static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1187,7 +1294,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { trace_seq_printf(s, "%u ", t_bytes(ent)); - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) @@ -1199,10 +1306,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) } static void blk_log_with_error(struct trace_seq *s, - const struct trace_entry *ent) + const struct trace_entry *ent, bool has_cg) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) @@ -1215,18 +1322,18 @@ static void blk_log_with_error(struct trace_seq *s, } } -static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { struct blk_io_trace_remap r = { .device_from = 0, }; - get_pdu_remap(ent, &r); + get_pdu_remap(ent, &r, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), MAJOR(r.device_from), MINOR(r.device_from), (unsigned long long)r.sector_from); } -static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1235,30 +1342,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) trace_seq_printf(s, "[%s]\n", cmd); } -static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg)); } -static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), cmd); + get_pdu_int(ent, has_cg), cmd); } -static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg) { - const struct blk_io_trace *t = te_blk_io_trace(ent); - trace_seq_putmem(s, t + 1, t->pdu_len); + trace_seq_putmem(s, pdu_start(ent, has_cg), + pdu_real_len(ent, has_cg)); trace_seq_putc(s, '\n'); } @@ -1298,7 +1406,8 @@ static void blk_tracer_reset(struct trace_array *tr) static const struct { const char *act[2]; - void (*print)(struct trace_seq *s, const struct trace_entry *ent); + void (*print)(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, @@ -1326,23 +1435,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, u16 what; bool long_act; blk_log_action_t *log_action; + bool has_cg; t = te_blk_io_trace(iter->ent); - what = t->action & ((1 << BLK_TC_SHIFT) - 1); + what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; + has_cg = t->action & __BLK_TA_CGROUP; - if (t->action == BLK_TN_MESSAGE) { - log_action(iter, long_act ? "message" : "m"); - blk_log_msg(s, iter->ent); + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { + log_action(iter, long_act ? "message" : "m", has_cg); + blk_log_msg(s, iter->ent, has_cg); return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) trace_seq_printf(s, "Unknown action %x\n", what); else { - log_action(iter, what2act[what].act[long_act]); - what2act[what].print(s, iter->ent); + log_action(iter, what2act[what].act[long_act], has_cg); + what2act[what].print(s, iter->ent, has_cg); } return trace_handle_return(s); @@ -1622,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!q->blk_trace); @@ -1641,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: @@ -1683,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { if (value) @@ -1709,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 96cea88fa00f..8319e09e15b9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2828,13 +2828,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are per_cpu ops, they still need their - * per_cpu field freed. Since, function tracing is + * If these are dynamic or per_cpu ops, they still + * need their data freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & FTRACE_OPS_FL_PER_CPU) - per_cpu_ops_free(ops); + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) + goto free_ops; + return 0; } @@ -2900,6 +2901,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (IS_ENABLED(CONFIG_PREEMPT)) synchronize_rcu_tasks(); + free_ops: arch_ftrace_trampoline_free(ops); if (ops->flags & FTRACE_OPS_FL_PER_CPU) @@ -4952,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); -static unsigned long save_global_trampoline; -static unsigned long save_global_flags; - static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -5690,10 +5689,51 @@ static int referenced_filters(struct dyn_ftrace *rec) return cnt; } +static void +clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int i; + + if (ftrace_hash_empty(hash)) + return; + + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + entry = __ftrace_lookup_ip(hash, rec->ip); + /* + * Do not allow this rec to match again. + * Yeah, it may waste some memory, but will be removed + * if/when the hash is modified again. + */ + if (entry) + entry->ip = 0; + } +} + +/* Clear any records from hashs */ +static void clear_mod_from_hashes(struct ftrace_page *pg) +{ + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->ops || !tr->ops->func_hash) + continue; + mutex_lock(&tr->ops->func_hash->regex_lock); + clear_mod_from_hash(pg, tr->ops->func_hash->filter_hash); + clear_mod_from_hash(pg, tr->ops->func_hash->notrace_hash); + mutex_unlock(&tr->ops->func_hash->regex_lock); + } + mutex_unlock(&trace_types_lock); +} + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page **last_pg; + struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; int order; @@ -5723,14 +5763,25 @@ void ftrace_release_mod(struct module *mod) ftrace_update_tot_cnt -= pg->index; *last_pg = pg->next; - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - free_pages((unsigned long)pg->records, order); - kfree(pg); + + pg->next = tmp_page; + tmp_page = pg; } else last_pg = &pg->next; } out_unlock: mutex_unlock(&ftrace_lock); + + for (pg = tmp_page; pg; pg = tmp_page) { + + /* Needs to be called outside of ftrace_lock */ + clear_mod_from_hashes(pg); + + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + tmp_page = pg->next; + kfree(pg); + } } void ftrace_module_enable(struct module *mod) @@ -6754,17 +6805,6 @@ void unregister_ftrace_graph(void) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * Function graph does not allocate the trampoline, but - * other global_ops do. We need to reset the ALLOC_TRAMP flag - * if one was used. - */ - global_ops.trampoline = save_global_trampoline; - if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) - global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; -#endif - out: mutex_unlock(&ftrace_lock); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 44004d8aa3b3..752e5daf0896 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1702,6 +1702,9 @@ void tracing_reset_all_online_cpus(void) struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->clear_trace) + continue; + tr->clear_trace = false; tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); @@ -2799,11 +2802,17 @@ static char *get_trace_buf(void) if (!buffer || buffer->nesting >= 4) return NULL; - return &buffer->buffer[buffer->nesting++][0]; + buffer->nesting++; + + /* Interrupts must see nesting incremented before we use the buffer */ + barrier(); + return &buffer->buffer[buffer->nesting][0]; } static void put_trace_buf(void) { + /* Don't let the decrement of nesting leak before this */ + barrier(); this_cpu_dec(trace_percpu_buffer->nesting); } @@ -4011,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); + struct trace_buffer *trace_buf = &tr->trace_buffer; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->current_trace->print_max) + trace_buf = &tr->max_buffer; +#endif if (cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(trace_buf); else - tracing_reset(&tr->trace_buffer, cpu); + tracing_reset(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { @@ -5349,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t == tr->current_trace) goto out; + /* Some tracers won't work on kernel command line */ + if (system_state < SYSTEM_RUNNING && t->noboot) { + pr_warn("Tracer '%s' is not allowed on command line, ignored\n", + t->name); + goto out; + } + /* Some tracers are only allowed for the top level buffer */ if (!trace_ok_for_array(t, tr)) { ret = -EINVAL; @@ -5658,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (!tracing_is_on() && iter->pos) + if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; mutex_unlock(&iter->mutex); @@ -6220,7 +6242,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr) tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + if (tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); tracing_reset_online_cpus(&tr->max_buffer); #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 490ba229931d..652c682707cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -245,6 +245,7 @@ struct trace_array { int stop_count; int clock_id; int nr_topts; + bool clear_trace; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; @@ -443,6 +444,8 @@ struct tracer { #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; #endif + /* True if tracer cannot be enabled in kernel param */ + bool noboot; }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 36132f9280e6..87468398b9ed 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -406,7 +406,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { tracing_stop_tgid_record(); - clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } call->class->reg(call, TRACE_REG_UNREGISTER, file); @@ -466,7 +466,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ - call->flags |= TRACE_EVENT_FL_WAS_ENABLED; + set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags); } break; } @@ -2058,6 +2058,10 @@ static void event_remove(struct trace_event_call *call) do_for_each_event_file(tr, file) { if (file->event_call != call) continue; + + if (file->flags & EVENT_FILE_FL_WAS_ENABLED) + tr->clear_trace = true; + ftrace_event_enable_disable(file, 0); /* * The do_for_each_event_file() is @@ -2396,15 +2400,11 @@ static void trace_module_add_events(struct module *mod) static void trace_module_remove_events(struct module *mod) { struct trace_event_call *call, *p; - bool clear_trace = false; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { - if (call->mod == mod) { - if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) - clear_trace = true; + if (call->mod == mod) __trace_remove_event_call(call); - } } up_write(&trace_event_sem); @@ -2416,8 +2416,7 @@ static void trace_module_remove_events(struct module *mod) * over from this module may be passed to the new module events and * unexpected results may occur. */ - if (clear_trace) - tracing_reset_all_online_cpus(); + tracing_reset_all_online_cpus(); } static int trace_module_notify(struct notifier_block *self, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 181e139a8057..61e7f0678d33 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -702,7 +702,7 @@ static void append_filter_err(struct filter_parse_state *ps, int pos = ps->lasterr_pos; char *buf, *pbuf; - buf = (char *)__get_free_page(GFP_TEMPORARY); + buf = (char *)__get_free_page(GFP_KERNEL); if (!buf) return; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d56123cdcc89..b8f1f54731af 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1543,7 +1543,7 @@ fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { - max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { pr_warn("Warning: could not register graph trace events\n"); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index cd7480d0a201..dca78fc48439 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -282,6 +282,7 @@ static struct tracer mmio_tracer __read_mostly = .close = mmio_close, .read = mmio_read, .print_line = mmio_print_line, + .noboot = true, }; __init static int init_mmio_trace(void) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bac629af2285..c738e764e2a5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; -} - /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); + T = __task_state_to_char(field->next_state); + S = __task_state_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ddec53b67646..0c331978b1a6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = prev->state; + entry->prev_state = __get_task_state(prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = next->state; + entry->next_state = __get_task_state(next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = curr->state; + entry->prev_state = __get_task_state(curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = wakee->state; + entry->next_state = __get_task_state(wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index cb917cebae29..b17ec642793b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -273,7 +273,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) goto out_free; if (cnt > 1) { if (trace_selftest_test_global_cnt == 0) - goto out; + goto out_free; } if (trace_selftest_test_dyn_cnt == 0) goto out_free; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a4df67cbc711..49cb41412eec 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; - /* - * There's a slight chance that we are tracing inside the - * RCU infrastructure, and rcu_irq_enter() will not work - * as expected. - */ - if (unlikely(rcu_irq_enter_disabled())) - return; - local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); - /* - * RCU may not be watching, make it see us. - * The stack trace code uses rcu_sched. - */ - rcu_irq_enter(); - /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) this_size -= tracer_frame; @@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack) } out: - rcu_irq_exit(); arch_spin_unlock(&stack_trace_max_lock); local_irq_restore(flags); } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 74d9a86eccc0..696afe72d3b1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; +static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, + struct syscall_metadata *sys_data, + struct syscall_trace_enter *rec) { + struct syscall_tp_t { + unsigned long long regs; + unsigned long syscall_nr; + unsigned long args[SYSCALL_DEFINE_MAXARGS]; + } param; + int i; + + *(struct pt_regs **)¶m = regs; + param.syscall_nr = rec->nr; + for (i = 0; i < sys_data->nb_args; i++) + param.args[i] = rec->args[i]; + return trace_call_bpf(prog, ¶m); +} + static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; + struct bpf_prog *prog; int syscall_nr; int rctx; int size; @@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + prog = READ_ONCE(sys_data->enter_event->prog); head = this_cpu_ptr(sys_data->enter_event->perf_events); - if (hlist_empty(head)) + if (!prog && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ @@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); + + if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) || + hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, head, NULL, NULL); @@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call) mutex_unlock(&syscall_trace_lock); } +static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, + struct syscall_trace_exit *rec) { + struct syscall_tp_t { + unsigned long long regs; + unsigned long syscall_nr; + unsigned long ret; + } param; + + *(struct pt_regs **)¶m = regs; + param.syscall_nr = rec->nr; + param.ret = rec->ret; + return trace_call_bpf(prog, ¶m); +} + static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; struct hlist_head *head; + struct bpf_prog *prog; int syscall_nr; int rctx; int size; @@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; + prog = READ_ONCE(sys_data->exit_event->prog); head = this_cpu_ptr(sys_data->exit_event->perf_events); - if (hlist_empty(head)) + if (!prog && hlist_empty(head)) return; /* We can probably do that at build time */ @@ -666,6 +708,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); + + if ((prog && !perf_call_bpf_exit(prog, regs, rec)) || + hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1, regs, head, NULL, NULL); } |