summaryrefslogtreecommitdiffstats
path: root/kernel/events/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--kernel/events/core.c652
1 files changed, 529 insertions, 123 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0463c1151bae..e453589da97c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -951,9 +951,9 @@ list_update_cgroup_event(struct perf_event *event,
/*
* Because cgroup events are always per-cpu events,
- * this will always be called from the right CPU.
+ * @ctx == &cpuctx->ctx.
*/
- cpuctx = __get_cpu_context(ctx);
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
/*
* Since setting cpuctx->cgrp is conditional on the current @cgrp
@@ -979,7 +979,8 @@ list_update_cgroup_event(struct perf_event *event,
cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
if (add)
- list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+ list_add(cpuctx_entry,
+ per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
else
list_del(cpuctx_entry);
}
@@ -1031,7 +1032,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
{
}
-void
+static inline void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}
@@ -1103,7 +1104,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
raw_spin_lock_init(&cpuctx->hrtimer_lock);
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
timer->function = perf_mux_hrtimer_handler;
}
@@ -1121,7 +1122,7 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
if (!cpuctx->hrtimer_active) {
cpuctx->hrtimer_active = 1;
hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
@@ -1887,6 +1888,104 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->generation++;
}
+static int
+perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
+{
+ if (!has_aux(aux_event))
+ return 0;
+
+ if (!event->pmu->aux_output_match)
+ return 0;
+
+ return event->pmu->aux_output_match(aux_event);
+}
+
+static void put_event(struct perf_event *event);
+static void event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx);
+
+static void perf_put_aux_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event *iter;
+
+ /*
+ * If event uses aux_event tear down the link
+ */
+ if (event->aux_event) {
+ iter = event->aux_event;
+ event->aux_event = NULL;
+ put_event(iter);
+ return;
+ }
+
+ /*
+ * If the event is an aux_event, tear down all links to
+ * it from other events.
+ */
+ for_each_sibling_event(iter, event->group_leader) {
+ if (iter->aux_event != event)
+ continue;
+
+ iter->aux_event = NULL;
+ put_event(event);
+
+ /*
+ * If it's ACTIVE, schedule it out and put it into ERROR
+ * state so that we don't try to schedule it again. Note
+ * that perf_event_enable() will clear the ERROR status.
+ */
+ event_sched_out(iter, cpuctx, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ }
+}
+
+static bool perf_need_aux_event(struct perf_event *event)
+{
+ return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+}
+
+static int perf_get_aux_event(struct perf_event *event,
+ struct perf_event *group_leader)
+{
+ /*
+ * Our group leader must be an aux event if we want to be
+ * an aux_output. This way, the aux event will precede its
+ * aux_output events in the group, and therefore will always
+ * schedule first.
+ */
+ if (!group_leader)
+ return 0;
+
+ /*
+ * aux_output and aux_sample_size are mutually exclusive.
+ */
+ if (event->attr.aux_output && event->attr.aux_sample_size)
+ return 0;
+
+ if (event->attr.aux_output &&
+ !perf_aux_output_match(event, group_leader))
+ return 0;
+
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
+ return 0;
+
+ if (!atomic_long_inc_not_zero(&group_leader->refcount))
+ return 0;
+
+ /*
+ * Link aux_outputs to their aux event; this is undone in
+ * perf_group_detach() by perf_put_aux_event(). When the
+ * group in torn down, the aux_output events loose their
+ * link to the aux_event and can't schedule any more.
+ */
+ event->aux_event = group_leader;
+
+ return 1;
+}
+
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
@@ -1902,6 +2001,8 @@ static void perf_group_detach(struct perf_event *event)
event->attach_state &= ~PERF_ATTACH_GROUP;
+ perf_put_aux_event(event);
+
/*
* If this is a sibling, remove it from its group.
*/
@@ -2154,7 +2255,7 @@ static void __perf_event_disable(struct perf_event *event,
*
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
- * remains valid. This condition is satisifed when called through
+ * remains valid. This condition is satisfied when called through
* perf_event_for_each_child or perf_event_for_each because they
* hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in perf_event_exit_event().
@@ -2581,6 +2682,25 @@ perf_install_in_context(struct perf_event_context *ctx,
*/
smp_store_release(&event->ctx, ctx);
+ /*
+ * perf_event_attr::disabled events will not run and can be initialized
+ * without IPI. Except when this is the first event for the context, in
+ * that case we need the magic of the IPI to set ctx->is_active.
+ *
+ * The IOC_ENABLE that is sure to follow the creation of a disabled
+ * event will issue the IPI and reprogram the hardware.
+ */
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
+ raw_spin_lock_irq(&ctx->lock);
+ if (ctx->task == TASK_TOMBSTONE) {
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
+
if (!task) {
cpu_function_call(cpu, __perf_install_in_context, event);
return;
@@ -3119,10 +3239,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
+ struct pmu *pmu = ctx->pmu;
+
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ /*
+ * PMU specific parts of task perf context can require
+ * additional synchronization. As an example of such
+ * synchronization see implementation details of Intel
+ * LBR call stack data profiling;
+ */
+ if (pmu->swap_task_ctx)
+ pmu->swap_task_ctx(ctx, next_ctx);
+ else
+ swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3694,11 +3825,23 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
perf_event_groups_insert(&ctx->flexible_groups, event);
}
+/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_first_active(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_context *ctx)
{
- return list_first_entry_or_null(&ctx->flexible_active,
- struct perf_event, active_list);
+ struct perf_event *event;
+
+ /* pick the first active flexible event */
+ event = list_first_entry_or_null(&ctx->flexible_active,
+ struct perf_event, active_list);
+
+ /* if no active flexible event, pick the first event */
+ if (!event) {
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
+ typeof(*event), group_node);
+ }
+
+ return event;
}
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3723,9 +3866,9 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_rotate)
- task_event = ctx_first_active(task_ctx);
+ task_event = ctx_event_to_rotate(task_ctx);
if (cpu_rotate)
- cpu_event = ctx_first_active(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
@@ -4089,10 +4232,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
return NULL;
__perf_event_init_context(ctx);
- if (task) {
- ctx->task = task;
- get_task_struct(task);
- }
+ if (task)
+ ctx->task = get_task_struct(task);
ctx->pmu = pmu;
return ctx;
@@ -4134,8 +4275,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (!task) {
/* Must be root to operate on a CPU event: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return ERR_PTR(-EACCES);
+ err = perf_allow_cpu(&event->attr);
+ if (err)
+ return ERR_PTR(err);
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -4232,7 +4374,7 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb);
+ struct perf_buffer *rb);
static void detach_sb_event(struct perf_event *event)
{
@@ -4444,6 +4586,8 @@ static void _free_event(struct perf_event *event)
unaccount_event(event);
+ security_perf_event_free(event);
+
if (event->rb) {
/*
* Can happen when we close an event with re-directed output.
@@ -4897,6 +5041,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct perf_event_context *ctx;
int ret;
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
ret = __perf_read(event, buf, count);
perf_event_ctx_unlock(event, ctx);
@@ -4907,7 +5055,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
static __poll_t perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
poll_wait(file, &event->waitq, wait);
@@ -4934,6 +5082,24 @@ static void _perf_event_reset(struct perf_event *event)
perf_event_update_userpage(event);
}
+/* Assume it's not an event with inherit set. */
+u64 perf_event_pause(struct perf_event *event, bool reset)
+{
+ struct perf_event_context *ctx;
+ u64 count;
+
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(event->attr.inherit);
+ _perf_event_disable(event);
+ count = local64_read(&event->count);
+ if (reset)
+ local64_set(&event->count, 0);
+ perf_event_ctx_unlock(event, ctx);
+
+ return count;
+}
+EXPORT_SYMBOL_GPL(perf_event_pause);
+
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
@@ -5011,16 +5177,11 @@ static int perf_event_check_period(struct perf_event *event, u64 value)
return event->pmu->check_period(event, value);
}
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
+static int _perf_event_period(struct perf_event *event, u64 value)
{
- u64 value;
-
if (!is_sampling_event(event))
return -EINVAL;
- if (copy_from_user(&value, arg, sizeof(value)))
- return -EFAULT;
-
if (!value)
return -EINVAL;
@@ -5038,6 +5199,19 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
return 0;
}
+int perf_event_period(struct perf_event *event, u64 value)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_event_period(event, value);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(perf_event_period);
+
static const struct file_operations perf_fops;
static inline int perf_fget_light(int fd, struct fd *p)
@@ -5081,8 +5255,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
return _perf_event_refresh(event, arg);
case PERF_EVENT_IOC_PERIOD:
- return perf_event_period(event, (u64 __user *)arg);
+ {
+ u64 value;
+
+ if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
+ return -EFAULT;
+ return _perf_event_period(event, value);
+ }
case PERF_EVENT_IOC_ID:
{
u64 id = primary_event_id(event);
@@ -5117,7 +5297,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
return perf_event_set_bpf_prog(event, arg);
case PERF_EVENT_IOC_PAUSE_OUTPUT: {
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5161,6 +5341,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct perf_event_context *ctx;
long ret;
+ /* Treat ioctl like writes as it is likely a mutating operation. */
+ ret = security_perf_event_write(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
ret = _perf_ioctl(event, cmd, arg);
perf_event_ctx_unlock(event, ctx);
@@ -5248,7 +5433,7 @@ static void calc_timer_values(struct perf_event *event,
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5280,7 +5465,7 @@ void __weak arch_perf_update_userpage(
void perf_event_update_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
u64 enabled, running, now;
rcu_read_lock();
@@ -5331,7 +5516,7 @@ EXPORT_SYMBOL_GPL(perf_event_update_userpage);
static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
struct perf_event *event = vmf->vma->vm_file->private_data;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
vm_fault_t ret = VM_FAULT_SIGBUS;
if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -5364,9 +5549,9 @@ unlock:
}
static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb)
+ struct perf_buffer *rb)
{
- struct ring_buffer *old_rb = NULL;
+ struct perf_buffer *old_rb = NULL;
unsigned long flags;
if (event->rb) {
@@ -5424,7 +5609,7 @@ static void ring_buffer_attach(struct perf_event *event,
static void ring_buffer_wakeup(struct perf_event *event)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5435,9 +5620,9 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_unlock();
}
-struct ring_buffer *ring_buffer_get(struct perf_event *event)
+struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5450,7 +5635,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)
return rb;
}
-void ring_buffer_put(struct ring_buffer *rb)
+void ring_buffer_put(struct perf_buffer *rb)
{
if (!refcount_dec_and_test(&rb->refcount))
return;
@@ -5488,7 +5673,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- struct ring_buffer *rb = ring_buffer_get(event);
+ struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
@@ -5512,7 +5697,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
perf_pmu_output_stop(event);
/* now it's safe to free the pages */
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
/* this has to be the last one */
@@ -5585,7 +5770,8 @@ again:
* undo the VM accounting.
*/
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
+ &mmap_user->locked_vm);
atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
free_uid(mmap_user);
@@ -5605,8 +5791,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
+ struct perf_buffer *rb = NULL;
unsigned long locked, lock_limit;
- struct ring_buffer *rb = NULL;
unsigned long vma_size;
unsigned long nr_pages;
long user_extra = 0, extra = 0;
@@ -5623,6 +5809,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
vma_size = vma->vm_end - vma->vm_start;
if (vma->vm_pgoff == 0) {
@@ -5727,16 +5917,30 @@ accounting:
*/
user_lock_limit *= num_online_cpus();
- user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+ user_locked = atomic_long_read(&user->locked_vm);
+ /*
+ * sysctl_perf_event_mlock may have changed, so that
+ * user->locked_vm > user_lock_limit
+ */
if (user_locked > user_lock_limit)
+ user_locked = user_lock_limit;
+ user_locked += user_extra;
+
+ if (user_locked > user_lock_limit) {
+ /*
+ * charge locked_vm until it hits user_lock_limit;
+ * charge the rest from pinned_vm
+ */
extra = user_locked - user_lock_limit;
+ user_extra -= extra;
+ }
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
- if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+ if ((locked > lock_limit) && perf_is_paranoid() &&
!capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
@@ -5971,7 +6175,7 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr,
* Get remaining task size from user stack pointer.
*
* It'd be better to take stack vma map and limit this more
- * precisly, but there's no way to get it safely under interrupt,
+ * precisely, but there's no way to get it safely under interrupt,
* so using TASK_SIZE as limit.
*/
static u64 perf_ustack_task_size(struct pt_regs *regs)
@@ -6066,6 +6270,122 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
+ struct perf_sample_data *data,
+ size_t size)
+{
+ struct perf_event *sampler = event->aux_event;
+ struct perf_buffer *rb;
+
+ data->aux_size = 0;
+
+ if (!sampler)
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
+ goto out;
+
+ rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ if (!rb)
+ goto out;
+
+ /*
+ * If this is an NMI hit inside sampling code, don't take
+ * the sample. See also perf_aux_sample_output().
+ */
+ if (READ_ONCE(rb->aux_in_sampling)) {
+ data->aux_size = 0;
+ } else {
+ size = min_t(size_t, size, perf_aux_size(rb));
+ data->aux_size = ALIGN(size, sizeof(u64));
+ }
+ ring_buffer_put(rb);
+
+out:
+ return data->aux_size;
+}
+
+long perf_pmu_snapshot_aux(struct perf_buffer *rb,
+ struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size)
+{
+ unsigned long flags;
+ long ret;
+
+ /*
+ * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
+ * paths. If we start calling them in NMI context, they may race with
+ * the IRQ ones, that is, for example, re-starting an event that's just
+ * been stopped, which is why we're using a separate callback that
+ * doesn't change the event state.
+ *
+ * IRQs need to be disabled to prevent IPIs from racing with us.
+ */
+ local_irq_save(flags);
+ /*
+ * Guard against NMI hits inside the critical section;
+ * see also perf_prepare_sample_aux().
+ */
+ WRITE_ONCE(rb->aux_in_sampling, 1);
+ barrier();
+
+ ret = event->pmu->snapshot_aux(event, handle, size);
+
+ barrier();
+ WRITE_ONCE(rb->aux_in_sampling, 0);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void perf_aux_sample_output(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ struct perf_event *sampler = event->aux_event;
+ struct perf_buffer *rb;
+ unsigned long pad;
+ long size;
+
+ if (WARN_ON_ONCE(!sampler || !data->aux_size))
+ return;
+
+ rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ if (!rb)
+ return;
+
+ size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
+
+ /*
+ * An error here means that perf_output_copy() failed (returned a
+ * non-zero surplus that it didn't copy), which in its current
+ * enlightened implementation is not possible. If that changes, we'd
+ * like to know.
+ */
+ if (WARN_ON_ONCE(size < 0))
+ goto out_put;
+
+ /*
+ * The pad comes from ALIGN()ing data->aux_size up to u64 in
+ * perf_prepare_sample_aux(), so should not be more than that.
+ */
+ pad = data->aux_size - size;
+ if (WARN_ON_ONCE(pad >= sizeof(u64)))
+ pad = 8;
+
+ if (pad) {
+ u64 zero = 0;
+ perf_output_copy(handle, &zero, pad);
+ }
+
+out_put:
+ ring_buffer_put(rb);
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -6385,11 +6705,18 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);
+ if (sample_type & PERF_SAMPLE_AUX) {
+ perf_output_put(handle, data->aux_size);
+
+ if (data->aux_size)
+ perf_aux_sample_output(event, handle, data);
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
if (wakeup_events) {
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
int events = local_inc_return(&rb->events);
if (events >= wakeup_events) {
@@ -6533,7 +6860,7 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_STACK_USER) {
/*
- * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+ * Either we need PERF_SAMPLE_STACK_USER bit to be always
* processed as the last one or have additional check added
* in case new sample type is added, because we could eat
* up the rest of the sample size.
@@ -6573,6 +6900,35 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
+
+ if (sample_type & PERF_SAMPLE_AUX) {
+ u64 size;
+
+ header->size += sizeof(u64); /* size */
+
+ /*
+ * Given the 16bit nature of header::size, an AUX sample can
+ * easily overflow it, what with all the preceding sample bits.
+ * Make sure this doesn't happen by using up to U16_MAX bytes
+ * per sample in total (rounded down to 8 byte boundary).
+ */
+ size = min_t(size_t, U16_MAX - header->size,
+ event->attr.aux_sample_size);
+ size = rounddown(size, 8);
+ size = perf_prepare_sample_aux(event, data, size);
+
+ WARN_ON_ONCE(size + header->size > U16_MAX);
+ header->size += size;
+ }
+ /*
+ * If you're adding more sample types here, you likely need to do
+ * something about the overflowing header::size, like repurpose the
+ * lowest 3 bits of size, which should be always zero at the moment.
+ * This raises a more important question, do we really need 512k sized
+ * samples and why, so good argumentation is in order for whatever you
+ * do here next.
+ */
+ WARN_ON_ONCE(header->size & 7);
}
static __always_inline int
@@ -6803,7 +7159,7 @@ void perf_event_exec(void)
}
struct remote_output {
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
int err;
};
@@ -6811,7 +7167,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
{
struct perf_event *parent = event->parent;
struct remote_output *ro = data;
- struct ring_buffer *rb = ro->rb;
+ struct perf_buffer *rb = ro->rb;
struct stop_event_data sd = {
.event = event,
};
@@ -6839,7 +7195,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
- struct pmu *pmu = event->pmu;
+ struct pmu *pmu = event->ctx->pmu;
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
struct remote_output ro = {
.rb = event->rb,
@@ -7148,7 +7504,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
{
struct path ns_path;
struct inode *ns_inode;
- void *error;
+ int error;
error = ns_get_path(&ns_path, task, ns_ops);
if (!error) {
@@ -9491,7 +9847,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
period = max_t(u64, 10000, hwc->sample_period);
}
hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@ -9513,7 +9869,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hwc->hrtimer.function = perf_swevent_hrtimer;
/*
@@ -9924,7 +10280,7 @@ static struct lock_class_key cpuctx_lock;
int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
- int cpu, ret;
+ int cpu, ret, max = PERF_TYPE_MAX;
mutex_lock(&pmus_lock);
ret = -ENOMEM;
@@ -9937,12 +10293,17 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto skip_type;
pmu->name = name;
- if (type < 0) {
- type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
- if (type < 0) {
- ret = type;
+ if (type != PERF_TYPE_SOFTWARE) {
+ if (type >= 0)
+ max = type;
+
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
+ if (ret < 0)
goto free_pdc;
- }
+
+ WARN_ON(type >= 0 && ret != type);
+
+ type = ret;
}
pmu->type = type;
@@ -10019,7 +10380,16 @@ got_cpu_context:
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
- list_add_rcu(&pmu->entry, &pmus);
+ /*
+ * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
+ * since these cannot be in the IDR. This way the linear search
+ * is fast, provided a valid software event is provided.
+ */
+ if (type == PERF_TYPE_SOFTWARE || !name)
+ list_add_rcu(&pmu->entry, &pmus);
+ else
+ list_add_tail_rcu(&pmu->entry, &pmus);
+
atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
@@ -10032,7 +10402,7 @@ free_dev:
put_device(pmu->dev);
free_idr:
- if (pmu->type >= PERF_TYPE_MAX)
+ if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
free_pdc:
@@ -10054,7 +10424,7 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
- if (pmu->type >= PERF_TYPE_MAX)
+ if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
if (pmu_bus_running) {
if (pmu->nr_addr_filters)
@@ -10124,9 +10494,8 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
static struct pmu *perf_init_event(struct perf_event *event)
{
+ int idx, type, ret;
struct pmu *pmu;
- int idx;
- int ret;
idx = srcu_read_lock(&pmus_srcu);
@@ -10138,17 +10507,32 @@ static struct pmu *perf_init_event(struct perf_event *event)
goto unlock;
}
+ /*
+ * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
+ * are often aliases for PERF_TYPE_RAW.
+ */
+ type = event->attr.type;
+ if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
+ type = PERF_TYPE_RAW;
+
+again:
rcu_read_lock();
- pmu = idr_find(&pmu_idr, event->attr.type);
+ pmu = idr_find(&pmu_idr, type);
rcu_read_unlock();
if (pmu) {
ret = perf_try_init_event(pmu, event);
+ if (ret == -ENOENT && event->attr.type != type) {
+ type = event->attr.type;
+ goto again;
+ }
+
if (ret)
pmu = ERR_PTR(ret);
+
goto unlock;
}
- list_for_each_entry_rcu(pmu, &pmus, entry) {
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event);
if (!ret)
goto unlock;
@@ -10355,8 +10739,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* and we cannot use the ctx information because we need the
* pmu before we get a ctx.
*/
- get_task_struct(task);
- event->hw.target = task;
+ event->hw.target = get_task_struct(task);
}
event->clock = &local_clock;
@@ -10368,12 +10751,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
if (overflow_handler == bpf_overflow_handler) {
- struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
+ struct bpf_prog *prog = parent_event->prog;
- if (IS_ERR(prog)) {
- err = PTR_ERR(prog);
- goto err_ns;
- }
+ bpf_prog_inc(prog);
event->prog = prog;
event->orig_overflow_handler =
parent_event->orig_overflow_handler;
@@ -10426,6 +10806,21 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_ns;
}
+ /*
+ * Disallow uncore-cgroup events, they don't make sense as the cgroup will
+ * be different on other CPUs in the uncore mask.
+ */
+ if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+ err = -EINVAL;
+ goto err_pmu;
+ }
+
+ if (event->attr.aux_output &&
+ !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
+ err = -EOPNOTSUPP;
+ goto err_pmu;
+ }
+
err = exclusive_event_init(event);
if (err)
goto err_pmu;
@@ -10465,11 +10860,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
+ err = security_perf_event_alloc(event);
+ if (err)
+ goto err_callchain_buffer;
+
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
return event;
+err_callchain_buffer:
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
err_addr_filters:
kfree(event->addr_filter_ranges);
@@ -10498,58 +10902,29 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
u32 size;
int ret;
- if (!access_ok(uattr, PERF_ATTR_SIZE_VER0))
- return -EFAULT;
-
- /*
- * zero the full structure, so that a short copy will be nice.
- */
+ /* Zero the full structure, so that a short copy will be nice. */
memset(attr, 0, sizeof(*attr));
ret = get_user(size, &uattr->size);
if (ret)
return ret;
- if (size > PAGE_SIZE) /* silly large */
- goto err_size;
-
- if (!size) /* abi compat */
+ /* ABI compatibility quirk: */
+ if (!size)
size = PERF_ATTR_SIZE_VER0;
-
- if (size < PERF_ATTR_SIZE_VER0)
+ if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
goto err_size;
- /*
- * If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
- if (size > sizeof(*attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(*attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- ret = get_user(val, addr);
- if (ret)
- return ret;
- if (val)
- goto err_size;
- }
- size = sizeof(*attr);
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+ if (ret) {
+ if (ret == -E2BIG)
+ goto err_size;
+ return ret;
}
- ret = copy_from_user(attr, uattr, size);
- if (ret)
- return -EFAULT;
-
attr->size = size;
- if (attr->__reserved_1)
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
return -EINVAL;
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -10587,9 +10962,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
attr->branch_sample_type = mask;
}
/* privileged levels capture (kernel, hv): check permissions */
- if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
- && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
+ ret = perf_allow_kernel(attr);
+ if (ret)
+ return ret;
+ }
}
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -10630,7 +11007,7 @@ err_size:
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct ring_buffer *rb = NULL;
+ struct perf_buffer *rb = NULL;
int ret = -EINVAL;
if (!output_event)
@@ -10802,13 +11179,19 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
+ /* Do we allow access to perf_event_open(2) ? */
+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ if (err)
+ return err;
+
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
if (!attr.exclude_kernel) {
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ err = perf_allow_kernel(&attr);
+ if (err)
+ return err;
}
if (attr.namespaces) {
@@ -10825,9 +11208,18 @@ SYSCALL_DEFINE5(perf_event_open,
}
/* Only privileged users can get physical addresses */
- if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
- perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
+ err = perf_allow_kernel(&attr);
+ if (err)
+ return err;
+ }
+
+ err = security_locked_down(LOCKDOWN_PERF);
+ if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
+ /* REGS_INTR can leak data, lockdown must prevent this */
+ return err;
+
+ err = 0;
/*
* In cgroup mode, the pid argument is used to pass the fd
@@ -11082,6 +11474,10 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
+ err = -EINVAL;
+ goto err_locked;
+ }
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
@@ -11228,8 +11624,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
int err;
/*
- * Get the target context (task or percpu):
+ * Grouping is not supported for kernel events, neither is 'AUX',
+ * make sure the caller's intentions are adjusted.
*/
+ if (attr->aux_output)
+ return ERR_PTR(-EINVAL);
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
overflow_handler, context, -1);
@@ -11241,6 +11640,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = TASK_TOMBSTONE;
+ /*
+ * Get the target context (task or percpu):
+ */
ctx = find_get_context(event->pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
@@ -11692,7 +12094,7 @@ inherit_event(struct perf_event *parent_event,
GFP_KERNEL);
if (!child_ctx->task_ctx_data) {
free_event(child_event);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
}
@@ -11794,6 +12196,10 @@ static int inherit_group(struct perf_event *parent_event,
child, leader, child_ctx);
if (IS_ERR(child_ctr))
return PTR_ERR(child_ctr);
+
+ if (sub->aux_event == parent_event && child_ctr &&
+ !perf_get_aux_event(child_ctr, leader))
+ return -EINVAL;
}
return 0;
}
OpenPOWER on IntegriCloud