diff options
Diffstat (limited to 'kernel/events')
-rw-r--r-- | kernel/events/core.c | 652 | ||||
-rw-r--r-- | kernel/events/hw_breakpoint.c | 4 | ||||
-rw-r--r-- | kernel/events/internal.h | 35 | ||||
-rw-r--r-- | kernel/events/ring_buffer.c | 112 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 94 |
5 files changed, 696 insertions, 201 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index 0463c1151bae..e453589da97c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -951,9 +951,9 @@ list_update_cgroup_event(struct perf_event *event, /* * Because cgroup events are always per-cpu events, - * this will always be called from the right CPU. + * @ctx == &cpuctx->ctx. */ - cpuctx = __get_cpu_context(ctx); + cpuctx = container_of(ctx, struct perf_cpu_context, ctx); /* * Since setting cpuctx->cgrp is conditional on the current @cgrp @@ -979,7 +979,8 @@ list_update_cgroup_event(struct perf_event *event, cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; if (add) - list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); + list_add(cpuctx_entry, + per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); else list_del(cpuctx_entry); } @@ -1031,7 +1032,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, { } -void +static inline void perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } @@ -1103,7 +1104,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpuctx->hrtimer_lock); - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } @@ -1121,7 +1122,7 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) if (!cpuctx->hrtimer_active) { cpuctx->hrtimer_active = 1; hrtimer_forward_now(timer, cpuctx->hrtimer_interval); - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); @@ -1887,6 +1888,104 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) ctx->generation++; } +static int +perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) +{ + if (!has_aux(aux_event)) + return 0; + + if (!event->pmu->aux_output_match) + return 0; + + return event->pmu->aux_output_match(aux_event); +} + +static void put_event(struct perf_event *event); +static void event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx); + +static void perf_put_aux_event(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event *iter; + + /* + * If event uses aux_event tear down the link + */ + if (event->aux_event) { + iter = event->aux_event; + event->aux_event = NULL; + put_event(iter); + return; + } + + /* + * If the event is an aux_event, tear down all links to + * it from other events. + */ + for_each_sibling_event(iter, event->group_leader) { + if (iter->aux_event != event) + continue; + + iter->aux_event = NULL; + put_event(event); + + /* + * If it's ACTIVE, schedule it out and put it into ERROR + * state so that we don't try to schedule it again. Note + * that perf_event_enable() will clear the ERROR status. + */ + event_sched_out(iter, cpuctx, ctx); + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + } +} + +static bool perf_need_aux_event(struct perf_event *event) +{ + return !!event->attr.aux_output || !!event->attr.aux_sample_size; +} + +static int perf_get_aux_event(struct perf_event *event, + struct perf_event *group_leader) +{ + /* + * Our group leader must be an aux event if we want to be + * an aux_output. This way, the aux event will precede its + * aux_output events in the group, and therefore will always + * schedule first. + */ + if (!group_leader) + return 0; + + /* + * aux_output and aux_sample_size are mutually exclusive. + */ + if (event->attr.aux_output && event->attr.aux_sample_size) + return 0; + + if (event->attr.aux_output && + !perf_aux_output_match(event, group_leader)) + return 0; + + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) + return 0; + + if (!atomic_long_inc_not_zero(&group_leader->refcount)) + return 0; + + /* + * Link aux_outputs to their aux event; this is undone in + * perf_group_detach() by perf_put_aux_event(). When the + * group in torn down, the aux_output events loose their + * link to the aux_event and can't schedule any more. + */ + event->aux_event = group_leader; + + return 1; +} + static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; @@ -1902,6 +2001,8 @@ static void perf_group_detach(struct perf_event *event) event->attach_state &= ~PERF_ATTACH_GROUP; + perf_put_aux_event(event); + /* * If this is a sibling, remove it from its group. */ @@ -2154,7 +2255,7 @@ static void __perf_event_disable(struct perf_event *event, * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through + * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). @@ -2581,6 +2682,25 @@ perf_install_in_context(struct perf_event_context *ctx, */ smp_store_release(&event->ctx, ctx); + /* + * perf_event_attr::disabled events will not run and can be initialized + * without IPI. Except when this is the first event for the context, in + * that case we need the magic of the IPI to set ctx->is_active. + * + * The IOC_ENABLE that is sure to follow the creation of a disabled + * event will issue the IPI and reprogram the hardware. + */ + if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) { + raw_spin_lock_irq(&ctx->lock); + if (ctx->task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } + add_event_to_ctx(event, ctx); + raw_spin_unlock_irq(&ctx->lock); + return; + } + if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; @@ -3119,10 +3239,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + struct pmu *pmu = ctx->pmu; + WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * PMU specific parts of task perf context can require + * additional synchronization. As an example of such + * synchronization see implementation details of Intel + * LBR call stack data profiling; + */ + if (pmu->swap_task_ctx) + pmu->swap_task_ctx(ctx, next_ctx); + else + swap(ctx->task_ctx_data, next_ctx->task_ctx_data); /* * RCU_INIT_POINTER here is safe because we've not @@ -3694,11 +3825,23 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) perf_event_groups_insert(&ctx->flexible_groups, event); } +/* pick an event from the flexible_groups to rotate */ static inline struct perf_event * -ctx_first_active(struct perf_event_context *ctx) +ctx_event_to_rotate(struct perf_event_context *ctx) { - return list_first_entry_or_null(&ctx->flexible_active, - struct perf_event, active_list); + struct perf_event *event; + + /* pick the first active flexible event */ + event = list_first_entry_or_null(&ctx->flexible_active, + struct perf_event, active_list); + + /* if no active flexible event, pick the first event */ + if (!event) { + event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree), + typeof(*event), group_node); + } + + return event; } static bool perf_rotate_context(struct perf_cpu_context *cpuctx) @@ -3723,9 +3866,9 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_disable(cpuctx->ctx.pmu); if (task_rotate) - task_event = ctx_first_active(task_ctx); + task_event = ctx_event_to_rotate(task_ctx); if (cpu_rotate) - cpu_event = ctx_first_active(&cpuctx->ctx); + cpu_event = ctx_event_to_rotate(&cpuctx->ctx); /* * As per the order given at ctx_resched() first 'pop' task flexible @@ -4089,10 +4232,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task) return NULL; __perf_event_init_context(ctx); - if (task) { - ctx->task = task; - get_task_struct(task); - } + if (task) + ctx->task = get_task_struct(task); ctx->pmu = pmu; return ctx; @@ -4134,8 +4275,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (!task) { /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); + err = perf_allow_cpu(&event->attr); + if (err) + return ERR_PTR(err); cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; @@ -4232,7 +4374,7 @@ static void free_event_rcu(struct rcu_head *head) } static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb); + struct perf_buffer *rb); static void detach_sb_event(struct perf_event *event) { @@ -4444,6 +4586,8 @@ static void _free_event(struct perf_event *event) unaccount_event(event); + security_perf_event_free(event); + if (event->rb) { /* * Can happen when we close an event with re-directed output. @@ -4897,6 +5041,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) struct perf_event_context *ctx; int ret; + ret = security_perf_event_read(event); + if (ret) + return ret; + ctx = perf_event_ctx_lock(event); ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); @@ -4907,7 +5055,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct ring_buffer *rb; + struct perf_buffer *rb; __poll_t events = EPOLLHUP; poll_wait(file, &event->waitq, wait); @@ -4934,6 +5082,24 @@ static void _perf_event_reset(struct perf_event *event) perf_event_update_userpage(event); } +/* Assume it's not an event with inherit set. */ +u64 perf_event_pause(struct perf_event *event, bool reset) +{ + struct perf_event_context *ctx; + u64 count; + + ctx = perf_event_ctx_lock(event); + WARN_ON_ONCE(event->attr.inherit); + _perf_event_disable(event); + count = local64_read(&event->count); + if (reset) + local64_set(&event->count, 0); + perf_event_ctx_unlock(event, ctx); + + return count; +} +EXPORT_SYMBOL_GPL(perf_event_pause); + /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block @@ -5011,16 +5177,11 @@ static int perf_event_check_period(struct perf_event *event, u64 value) return event->pmu->check_period(event, value); } -static int perf_event_period(struct perf_event *event, u64 __user *arg) +static int _perf_event_period(struct perf_event *event, u64 value) { - u64 value; - if (!is_sampling_event(event)) return -EINVAL; - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - if (!value) return -EINVAL; @@ -5038,6 +5199,19 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) return 0; } +int perf_event_period(struct perf_event *event, u64 value) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_event_period(event, value); + perf_event_ctx_unlock(event, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(perf_event_period); + static const struct file_operations perf_fops; static inline int perf_fget_light(int fd, struct fd *p) @@ -5081,8 +5255,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: - return perf_event_period(event, (u64 __user *)arg); + { + u64 value; + + if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) + return -EFAULT; + return _perf_event_period(event, value); + } case PERF_EVENT_IOC_ID: { u64 id = primary_event_id(event); @@ -5117,7 +5297,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon return perf_event_set_bpf_prog(event, arg); case PERF_EVENT_IOC_PAUSE_OUTPUT: { - struct ring_buffer *rb; + struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); @@ -5161,6 +5341,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct perf_event_context *ctx; long ret; + /* Treat ioctl like writes as it is likely a mutating operation. */ + ret = security_perf_event_write(event); + if (ret) + return ret; + ctx = perf_event_ctx_lock(event); ret = _perf_ioctl(event, cmd, arg); perf_event_ctx_unlock(event, ctx); @@ -5248,7 +5433,7 @@ static void calc_timer_values(struct perf_event *event, static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct ring_buffer *rb; + struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); @@ -5280,7 +5465,7 @@ void __weak arch_perf_update_userpage( void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct ring_buffer *rb; + struct perf_buffer *rb; u64 enabled, running, now; rcu_read_lock(); @@ -5331,7 +5516,7 @@ EXPORT_SYMBOL_GPL(perf_event_update_userpage); static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) { struct perf_event *event = vmf->vma->vm_file->private_data; - struct ring_buffer *rb; + struct perf_buffer *rb; vm_fault_t ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { @@ -5364,9 +5549,9 @@ unlock: } static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb) + struct perf_buffer *rb) { - struct ring_buffer *old_rb = NULL; + struct perf_buffer *old_rb = NULL; unsigned long flags; if (event->rb) { @@ -5424,7 +5609,7 @@ static void ring_buffer_attach(struct perf_event *event, static void ring_buffer_wakeup(struct perf_event *event) { - struct ring_buffer *rb; + struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); @@ -5435,9 +5620,9 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_unlock(); } -struct ring_buffer *ring_buffer_get(struct perf_event *event) +struct perf_buffer *ring_buffer_get(struct perf_event *event) { - struct ring_buffer *rb; + struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); @@ -5450,7 +5635,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -void ring_buffer_put(struct ring_buffer *rb) +void ring_buffer_put(struct perf_buffer *rb) { if (!refcount_dec_and_test(&rb->refcount)) return; @@ -5488,7 +5673,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - struct ring_buffer *rb = ring_buffer_get(event); + struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); @@ -5512,7 +5697,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) perf_pmu_output_stop(event); /* now it's safe to free the pages */ - atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ @@ -5585,7 +5770,8 @@ again: * undo the VM accounting. */ - atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, + &mmap_user->locked_vm); atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); @@ -5605,8 +5791,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); + struct perf_buffer *rb = NULL; unsigned long locked, lock_limit; - struct ring_buffer *rb = NULL; unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; @@ -5623,6 +5809,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; + ret = security_perf_event_read(event); + if (ret) + return ret; + vma_size = vma->vm_end - vma->vm_start; if (vma->vm_pgoff == 0) { @@ -5727,16 +5917,30 @@ accounting: */ user_lock_limit *= num_online_cpus(); - user_locked = atomic_long_read(&user->locked_vm) + user_extra; + user_locked = atomic_long_read(&user->locked_vm); + /* + * sysctl_perf_event_mlock may have changed, so that + * user->locked_vm > user_lock_limit + */ if (user_locked > user_lock_limit) + user_locked = user_lock_limit; + user_locked += user_extra; + + if (user_locked > user_lock_limit) { + /* + * charge locked_vm until it hits user_lock_limit; + * charge the rest from pinned_vm + */ extra = user_locked - user_lock_limit; + user_extra -= extra; + } lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + if ((locked > lock_limit) && perf_is_paranoid() && !capable(CAP_IPC_LOCK)) { ret = -EPERM; goto unlock; @@ -5971,7 +6175,7 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr, * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more - * precisly, but there's no way to get it safely under interrupt, + * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. */ static u64 perf_ustack_task_size(struct pt_regs *regs) @@ -6066,6 +6270,122 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, } } +static unsigned long perf_prepare_sample_aux(struct perf_event *event, + struct perf_sample_data *data, + size_t size) +{ + struct perf_event *sampler = event->aux_event; + struct perf_buffer *rb; + + data->aux_size = 0; + + if (!sampler) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) + goto out; + + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + if (!rb) + goto out; + + /* + * If this is an NMI hit inside sampling code, don't take + * the sample. See also perf_aux_sample_output(). + */ + if (READ_ONCE(rb->aux_in_sampling)) { + data->aux_size = 0; + } else { + size = min_t(size_t, size, perf_aux_size(rb)); + data->aux_size = ALIGN(size, sizeof(u64)); + } + ring_buffer_put(rb); + +out: + return data->aux_size; +} + +long perf_pmu_snapshot_aux(struct perf_buffer *rb, + struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size) +{ + unsigned long flags; + long ret; + + /* + * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler + * paths. If we start calling them in NMI context, they may race with + * the IRQ ones, that is, for example, re-starting an event that's just + * been stopped, which is why we're using a separate callback that + * doesn't change the event state. + * + * IRQs need to be disabled to prevent IPIs from racing with us. + */ + local_irq_save(flags); + /* + * Guard against NMI hits inside the critical section; + * see also perf_prepare_sample_aux(). + */ + WRITE_ONCE(rb->aux_in_sampling, 1); + barrier(); + + ret = event->pmu->snapshot_aux(event, handle, size); + + barrier(); + WRITE_ONCE(rb->aux_in_sampling, 0); + local_irq_restore(flags); + + return ret; +} + +static void perf_aux_sample_output(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + struct perf_event *sampler = event->aux_event; + struct perf_buffer *rb; + unsigned long pad; + long size; + + if (WARN_ON_ONCE(!sampler || !data->aux_size)) + return; + + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + if (!rb) + return; + + size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); + + /* + * An error here means that perf_output_copy() failed (returned a + * non-zero surplus that it didn't copy), which in its current + * enlightened implementation is not possible. If that changes, we'd + * like to know. + */ + if (WARN_ON_ONCE(size < 0)) + goto out_put; + + /* + * The pad comes from ALIGN()ing data->aux_size up to u64 in + * perf_prepare_sample_aux(), so should not be more than that. + */ + pad = data->aux_size - size; + if (WARN_ON_ONCE(pad >= sizeof(u64))) + pad = 8; + + if (pad) { + u64 zero = 0; + perf_output_copy(handle, &zero, pad); + } + +out_put: + ring_buffer_put(rb); +} + static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -6385,11 +6705,18 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); + if (sample_type & PERF_SAMPLE_AUX) { + perf_output_put(handle, data->aux_size); + + if (data->aux_size) + perf_aux_sample_output(event, handle, data); + } + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; if (wakeup_events) { - struct ring_buffer *rb = handle->rb; + struct perf_buffer *rb = handle->rb; int events = local_inc_return(&rb->events); if (events >= wakeup_events) { @@ -6533,7 +6860,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_STACK_USER) { /* - * Either we need PERF_SAMPLE_STACK_USER bit to be allways + * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size. @@ -6573,6 +6900,35 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); + + if (sample_type & PERF_SAMPLE_AUX) { + u64 size; + + header->size += sizeof(u64); /* size */ + + /* + * Given the 16bit nature of header::size, an AUX sample can + * easily overflow it, what with all the preceding sample bits. + * Make sure this doesn't happen by using up to U16_MAX bytes + * per sample in total (rounded down to 8 byte boundary). + */ + size = min_t(size_t, U16_MAX - header->size, + event->attr.aux_sample_size); + size = rounddown(size, 8); + size = perf_prepare_sample_aux(event, data, size); + + WARN_ON_ONCE(size + header->size > U16_MAX); + header->size += size; + } + /* + * If you're adding more sample types here, you likely need to do + * something about the overflowing header::size, like repurpose the + * lowest 3 bits of size, which should be always zero at the moment. + * This raises a more important question, do we really need 512k sized + * samples and why, so good argumentation is in order for whatever you + * do here next. + */ + WARN_ON_ONCE(header->size & 7); } static __always_inline int @@ -6803,7 +7159,7 @@ void perf_event_exec(void) } struct remote_output { - struct ring_buffer *rb; + struct perf_buffer *rb; int err; }; @@ -6811,7 +7167,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) { struct perf_event *parent = event->parent; struct remote_output *ro = data; - struct ring_buffer *rb = ro->rb; + struct perf_buffer *rb = ro->rb; struct stop_event_data sd = { .event = event, }; @@ -6839,7 +7195,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; - struct pmu *pmu = event->pmu; + struct pmu *pmu = event->ctx->pmu; struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct remote_output ro = { .rb = event->rb, @@ -7148,7 +7504,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, { struct path ns_path; struct inode *ns_inode; - void *error; + int error; error = ns_get_path(&ns_path, task, ns_ops); if (!error) { @@ -9491,7 +9847,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) period = max_t(u64, 10000, hwc->sample_period); } hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) @@ -9513,7 +9869,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hwc->hrtimer.function = perf_swevent_hrtimer; /* @@ -9924,7 +10280,7 @@ static struct lock_class_key cpuctx_lock; int perf_pmu_register(struct pmu *pmu, const char *name, int type) { - int cpu, ret; + int cpu, ret, max = PERF_TYPE_MAX; mutex_lock(&pmus_lock); ret = -ENOMEM; @@ -9937,12 +10293,17 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) goto skip_type; pmu->name = name; - if (type < 0) { - type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); - if (type < 0) { - ret = type; + if (type != PERF_TYPE_SOFTWARE) { + if (type >= 0) + max = type; + + ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); + if (ret < 0) goto free_pdc; - } + + WARN_ON(type >= 0 && ret != type); + + type = ret; } pmu->type = type; @@ -10019,7 +10380,16 @@ got_cpu_context: if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; - list_add_rcu(&pmu->entry, &pmus); + /* + * Ensure the TYPE_SOFTWARE PMUs are at the head of the list, + * since these cannot be in the IDR. This way the linear search + * is fast, provided a valid software event is provided. + */ + if (type == PERF_TYPE_SOFTWARE || !name) + list_add_rcu(&pmu->entry, &pmus); + else + list_add_tail_rcu(&pmu->entry, &pmus); + atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: @@ -10032,7 +10402,7 @@ free_dev: put_device(pmu->dev); free_idr: - if (pmu->type >= PERF_TYPE_MAX) + if (pmu->type != PERF_TYPE_SOFTWARE) idr_remove(&pmu_idr, pmu->type); free_pdc: @@ -10054,7 +10424,7 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); - if (pmu->type >= PERF_TYPE_MAX) + if (pmu->type != PERF_TYPE_SOFTWARE) idr_remove(&pmu_idr, pmu->type); if (pmu_bus_running) { if (pmu->nr_addr_filters) @@ -10124,9 +10494,8 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { + int idx, type, ret; struct pmu *pmu; - int idx; - int ret; idx = srcu_read_lock(&pmus_srcu); @@ -10138,17 +10507,32 @@ static struct pmu *perf_init_event(struct perf_event *event) goto unlock; } + /* + * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * are often aliases for PERF_TYPE_RAW. + */ + type = event->attr.type; + if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) + type = PERF_TYPE_RAW; + +again: rcu_read_lock(); - pmu = idr_find(&pmu_idr, event->attr.type); + pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { ret = perf_try_init_event(pmu, event); + if (ret == -ENOENT && event->attr.type != type) { + type = event->attr.type; + goto again; + } + if (ret) pmu = ERR_PTR(ret); + goto unlock; } - list_for_each_entry_rcu(pmu, &pmus, entry) { + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; @@ -10355,8 +10739,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ - get_task_struct(task); - event->hw.target = task; + event->hw.target = get_task_struct(task); } event->clock = &local_clock; @@ -10368,12 +10751,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (overflow_handler == bpf_overflow_handler) { - struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + struct bpf_prog *prog = parent_event->prog; - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto err_ns; - } + bpf_prog_inc(prog); event->prog = prog; event->orig_overflow_handler = parent_event->orig_overflow_handler; @@ -10426,6 +10806,21 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + /* + * Disallow uncore-cgroup events, they don't make sense as the cgroup will + * be different on other CPUs in the uncore mask. + */ + if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { + err = -EINVAL; + goto err_pmu; + } + + if (event->attr.aux_output && + !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { + err = -EOPNOTSUPP; + goto err_pmu; + } + err = exclusive_event_init(event); if (err) goto err_pmu; @@ -10465,11 +10860,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + err = security_perf_event_alloc(event); + if (err) + goto err_callchain_buffer; + /* symmetric to unaccount_event() in _free_event() */ account_event(event); return event; +err_callchain_buffer: + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } err_addr_filters: kfree(event->addr_filter_ranges); @@ -10498,58 +10902,29 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, u32 size; int ret; - if (!access_ok(uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ + /* Zero the full structure, so that a short copy will be nice. */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ + /* ABI compatibility quirk: */ + if (!size) size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) + if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - attr->size = size; - if (attr->__reserved_1) + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) @@ -10587,9 +10962,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->branch_sample_type = mask; } /* privileged levels capture (kernel, hv): check permissions */ - if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) - && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { + ret = perf_allow_kernel(attr); + if (ret) + return ret; + } } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { @@ -10630,7 +11007,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct ring_buffer *rb = NULL; + struct perf_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) @@ -10802,13 +11179,19 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; + /* Do we allow access to perf_event_open(2) ? */ + err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + if (err) + return err; + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + err = perf_allow_kernel(&attr); + if (err) + return err; } if (attr.namespaces) { @@ -10825,9 +11208,18 @@ SYSCALL_DEFINE5(perf_event_open, } /* Only privileged users can get physical addresses */ - if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && - perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { + err = perf_allow_kernel(&attr); + if (err) + return err; + } + + err = security_locked_down(LOCKDOWN_PERF); + if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) + /* REGS_INTR can leak data, lockdown must prevent this */ + return err; + + err = 0; /* * In cgroup mode, the pid argument is used to pass the fd @@ -11082,6 +11474,10 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { + err = -EINVAL; + goto err_locked; + } /* * Must be under the same ctx::mutex as perf_install_in_context(), @@ -11228,8 +11624,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, int err; /* - * Get the target context (task or percpu): + * Grouping is not supported for kernel events, neither is 'AUX', + * make sure the caller's intentions are adjusted. */ + if (attr->aux_output) + return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); @@ -11241,6 +11640,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; + /* + * Get the target context (task or percpu): + */ ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); @@ -11692,7 +12094,7 @@ inherit_event(struct perf_event *parent_event, GFP_KERNEL); if (!child_ctx->task_ctx_data) { free_event(child_event); - return NULL; + return ERR_PTR(-ENOMEM); } } @@ -11794,6 +12196,10 @@ static int inherit_group(struct perf_event *parent_event, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); + + if (sub->aux_event == parent_event && child_ctr && + !perf_get_aux_event(child_ctr, leader)) + return -EINVAL; } return 0; } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index c5cd852fe86b..3cc8416ec844 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -413,7 +413,7 @@ static int hw_breakpoint_parse(struct perf_event *bp, int register_perf_hw_breakpoint(struct perf_event *bp) { - struct arch_hw_breakpoint hw; + struct arch_hw_breakpoint hw = { }; int err; err = reserve_bp_slot(bp); @@ -461,7 +461,7 @@ int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { - struct arch_hw_breakpoint hw; + struct arch_hw_breakpoint hw = { }; int err; err = hw_breakpoint_parse(bp, attr, &hw); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 3aef4191798c..f16f66b6b655 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -10,7 +10,7 @@ #define RING_BUFFER_WRITABLE 0x01 -struct ring_buffer { +struct perf_buffer { refcount_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC @@ -50,6 +50,7 @@ struct ring_buffer { unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; + int aux_in_sampling; void **aux_pages; void *aux_priv; @@ -57,17 +58,17 @@ struct ring_buffer { void *data_pages[0]; }; -extern void rb_free(struct ring_buffer *rb); +extern void rb_free(struct perf_buffer *rb); static inline void rb_free_rcu(struct rcu_head *rcu_head) { - struct ring_buffer *rb; + struct perf_buffer *rb; - rb = container_of(rcu_head, struct ring_buffer, rcu_head); + rb = container_of(rcu_head, struct perf_buffer, rcu_head); rb_free(rb); } -static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause) +static inline void rb_toggle_paused(struct perf_buffer *rb, bool pause) { if (!pause && rb->nr_pages) rb->paused = 0; @@ -75,16 +76,16 @@ static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause) rb->paused = 1; } -extern struct ring_buffer * +extern struct perf_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); -extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, +extern int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags); -extern void rb_free_aux(struct ring_buffer *rb); -extern struct ring_buffer *ring_buffer_get(struct perf_event *event); -extern void ring_buffer_put(struct ring_buffer *rb); +extern void rb_free_aux(struct perf_buffer *rb); +extern struct perf_buffer *ring_buffer_get(struct perf_event *event); +extern void ring_buffer_put(struct perf_buffer *rb); -static inline bool rb_has_aux(struct ring_buffer *rb) +static inline bool rb_has_aux(struct perf_buffer *rb) { return !!rb->aux_nr_pages; } @@ -93,7 +94,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags); extern struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff); +perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff); #ifdef CONFIG_PERF_USE_VMALLOC /* @@ -102,25 +103,25 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff); * Required for architectures that have d-cache aliasing issues. */ -static inline int page_order(struct ring_buffer *rb) +static inline int page_order(struct perf_buffer *rb) { return rb->page_order; } #else -static inline int page_order(struct ring_buffer *rb) +static inline int page_order(struct perf_buffer *rb) { return 0; } #endif -static inline unsigned long perf_data_size(struct ring_buffer *rb) +static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } -static inline unsigned long perf_aux_size(struct ring_buffer *rb) +static inline unsigned long perf_aux_size(struct perf_buffer *rb) { return rb->aux_nr_pages << PAGE_SHIFT; } @@ -140,7 +141,7 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) buf += written; \ handle->size -= written; \ if (!handle->size) { \ - struct ring_buffer *rb = handle->rb; \ + struct perf_buffer *rb = handle->rb; \ \ handle->page++; \ handle->page &= rb->nr_pages - 1; \ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ffb59a4ef4ff..192b8abc6330 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -35,7 +35,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) */ static void perf_output_get_handle(struct perf_output_handle *handle) { - struct ring_buffer *rb = handle->rb; + struct perf_buffer *rb = handle->rb; preempt_disable(); @@ -49,7 +49,7 @@ static void perf_output_get_handle(struct perf_output_handle *handle) static void perf_output_put_handle(struct perf_output_handle *handle) { - struct ring_buffer *rb = handle->rb; + struct perf_buffer *rb = handle->rb; unsigned long head; unsigned int nest; @@ -150,7 +150,7 @@ __perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, bool backward) { - struct ring_buffer *rb; + struct perf_buffer *rb; unsigned long tail, offset, head; int have_lost, page_shift; struct { @@ -301,7 +301,7 @@ void perf_output_end(struct perf_output_handle *handle) } static void -ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) +ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) { long max_size = perf_data_size(rb); @@ -361,7 +361,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, { struct perf_event *output_event = event; unsigned long aux_head, aux_tail; - struct ring_buffer *rb; + struct perf_buffer *rb; unsigned int nest; if (output_event->parent) @@ -449,7 +449,7 @@ err: } EXPORT_SYMBOL_GPL(perf_aux_output_begin); -static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb) +static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb) { if (rb->aux_overwrite) return false; @@ -475,7 +475,7 @@ static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb) void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED); - struct ring_buffer *rb = handle->rb; + struct perf_buffer *rb = handle->rb; unsigned long aux_head; /* in overwrite mode, driver provides aux_head via handle */ @@ -532,7 +532,7 @@ EXPORT_SYMBOL_GPL(perf_aux_output_end); */ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { - struct ring_buffer *rb = handle->rb; + struct perf_buffer *rb = handle->rb; if (size > handle->size) return -ENOSPC; @@ -562,6 +562,42 @@ void *perf_get_aux(struct perf_output_handle *handle) } EXPORT_SYMBOL_GPL(perf_get_aux); +/* + * Copy out AUX data from an AUX handle. + */ +long perf_output_copy_aux(struct perf_output_handle *aux_handle, + struct perf_output_handle *handle, + unsigned long from, unsigned long to) +{ + struct perf_buffer *rb = aux_handle->rb; + unsigned long tocopy, remainder, len = 0; + void *addr; + + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + + do { + tocopy = PAGE_SIZE - offset_in_page(from); + if (to > from) + tocopy = min(tocopy, to - from); + if (!tocopy) + break; + + addr = rb->aux_pages[from >> PAGE_SHIFT]; + addr += offset_in_page(from); + + remainder = perf_output_copy(handle, addr, tocopy); + if (remainder) + return -EFAULT; + + len += tocopy; + from += tocopy; + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + } while (to != from); + + return len; +} + #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) static struct page *rb_alloc_aux_page(int node, int order) @@ -590,7 +626,7 @@ static struct page *rb_alloc_aux_page(int node, int order) return page; } -static void rb_free_aux_page(struct ring_buffer *rb, int idx) +static void rb_free_aux_page(struct perf_buffer *rb, int idx) { struct page *page = virt_to_page(rb->aux_pages[idx]); @@ -599,7 +635,7 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) __free_page(page); } -static void __rb_free_aux(struct ring_buffer *rb) +static void __rb_free_aux(struct perf_buffer *rb) { int pg; @@ -626,7 +662,7 @@ static void __rb_free_aux(struct ring_buffer *rb) } } -int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, +int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { bool overwrite = !(flags & RING_BUFFER_WRITABLE); @@ -717,7 +753,7 @@ out: return ret; } -void rb_free_aux(struct ring_buffer *rb) +void rb_free_aux(struct perf_buffer *rb) { if (refcount_dec_and_test(&rb->aux_refcount)) __rb_free_aux(rb); @@ -730,7 +766,7 @@ void rb_free_aux(struct ring_buffer *rb) */ static struct page * -__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { if (pgoff > rb->nr_pages) return NULL; @@ -754,13 +790,21 @@ static void *perf_mmap_alloc_page(int cpu) return page_address(page); } -struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) +static void perf_mmap_free_page(void *addr) { - struct ring_buffer *rb; + struct page *page = virt_to_page(addr); + + page->mapping = NULL; + __free_page(page); +} + +struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) +{ + struct perf_buffer *rb; unsigned long size; int i; - size = sizeof(struct ring_buffer); + size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) @@ -788,9 +832,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) fail_data_pages: for (i--; i >= 0; i--) - free_page((unsigned long)rb->data_pages[i]); + perf_mmap_free_page(rb->data_pages[i]); - free_page((unsigned long)rb->user_page); + perf_mmap_free_page(rb->user_page); fail_user_page: kfree(rb); @@ -799,32 +843,24 @@ fail: return NULL; } -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -void rb_free(struct ring_buffer *rb) +void rb_free(struct perf_buffer *rb) { int i; - perf_mmap_free_page((unsigned long)rb->user_page); + perf_mmap_free_page(rb->user_page); for (i = 0; i < rb->nr_pages; i++) - perf_mmap_free_page((unsigned long)rb->data_pages[i]); + perf_mmap_free_page(rb->data_pages[i]); kfree(rb); } #else -static int data_page_nr(struct ring_buffer *rb) +static int data_page_nr(struct perf_buffer *rb) { return rb->nr_pages << page_order(rb); } static struct page * -__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { /* The '>' counts in the user page. */ if (pgoff > data_page_nr(rb)) @@ -842,11 +878,11 @@ static void perf_mmap_unmark_page(void *addr) static void rb_free_work(struct work_struct *work) { - struct ring_buffer *rb; + struct perf_buffer *rb; void *base; int i, nr; - rb = container_of(work, struct ring_buffer, work); + rb = container_of(work, struct perf_buffer, work); nr = data_page_nr(rb); base = rb->user_page; @@ -858,18 +894,18 @@ static void rb_free_work(struct work_struct *work) kfree(rb); } -void rb_free(struct ring_buffer *rb) +void rb_free(struct perf_buffer *rb) { schedule_work(&rb->work); } -struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) +struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { - struct ring_buffer *rb; + struct perf_buffer *rb; unsigned long size; void *all_buf; - size = sizeof(struct ring_buffer); + size = sizeof(struct perf_buffer); size += sizeof(void *); rb = kzalloc(size, GFP_KERNEL); @@ -903,7 +939,7 @@ fail: #endif struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { if (rb->aux_nr_pages) { /* above AUX space */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 84fa00497c49..ece7e13f6e4a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,7 @@ #include <linux/percpu-rwsem.h> #include <linux/task_work.h> #include <linux/shmem_fs.h> +#include <linux/khugepaged.h> #include <linux/uprobes.h> @@ -143,17 +144,19 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) * * @vma: vma that holds the pte pointing to page * @addr: address the old @page is mapped at - * @page: the cowed page we are replacing by kpage - * @kpage: the modified page we replace page by + * @old_page: the page we are replacing by new_page + * @new_page: the modified page we replace page by * - * Returns 0 on success, -EFAULT on failure. + * If @new_page is NULL, only unmap @old_page. + * + * Returns 0 on success, negative error code otherwise. */ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; struct page_vma_mapped_walk pvmw = { - .page = old_page, + .page = compound_head(old_page), .vma = vma, .address = addr, }; @@ -164,12 +167,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); - VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); - - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, - false); - if (err) - return err; + if (new_page) { + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, + &memcg, false); + if (err) + return err; + } /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(old_page); @@ -177,15 +180,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) { - mem_cgroup_cancel_charge(new_page, memcg, false); + if (new_page) + mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); - lru_cache_add_active_or_unevictable(new_page, vma); + if (new_page) { + get_page(new_page); + page_add_new_anon_rmap(new_page, vma, addr, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); + lru_cache_add_active_or_unevictable(new_page, vma); + } else + /* no new page, just dec_mm_counter for old_page */ + dec_mm_counter(mm, MM_ANONPAGES); if (!PageAnon(old_page)) { dec_mm_counter(mm, mm_counter_file(old_page)); @@ -194,8 +202,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); ptep_clear_flush_notify(vma, addr, pvmw.pte); - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + if (new_page) + set_pte_at_notify(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, false); if (!page_mapped(old_page)) @@ -464,14 +473,18 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct page *old_page, *new_page; struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; + bool orig_page_huge = false; + unsigned int gup_flags = FOLL_FORCE; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); retry: + if (is_register) + gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags, + &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -479,6 +492,12 @@ retry: if (ret <= 0) goto put_old; + if (WARN(!is_register && PageCompound(old_page), + "uprobe unregister should never work on compound page\n")) { + ret = -EINVAL; + goto put_old; + } + /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); @@ -488,6 +507,10 @@ retry: ref_ctr_updated = 1; } + ret = 0; + if (!is_register && !PageAnon(old_page)) + goto put_old; + ret = anon_vma_prepare(vma); if (ret) goto put_old; @@ -501,8 +524,33 @@ retry: copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + if (!is_register) { + struct page *orig_page; + pgoff_t index; + + VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); + + index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; + orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, + index); + + if (orig_page) { + if (PageUptodate(orig_page) && + pages_identical(new_page, orig_page)) { + /* let go new_page */ + put_page(new_page); + new_page = NULL; + + if (PageCompound(orig_page)) + orig_page_huge = true; + } + put_page(orig_page); + } + } + ret = __replace_page(vma, vaddr, old_page, new_page); - put_page(new_page); + if (new_page) + put_page(new_page); put_old: put_page(old_page); @@ -513,6 +561,10 @@ put_old: if (ret && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); + /* try collapse pmd for compound page */ + if (!ret && orig_page_huge) + collapse_pte_mapped_thp(mm, vaddr); + return ret; } @@ -1405,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); - if (area->vaddr & ~PAGE_MASK) { + if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; } |