From ca037701a025334e724e5c61b3b1082940c8b981 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 2 Mar 2010 19:52:12 +0100
Subject: perf, x86: Add PEBS infrastructure

This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap-like snapshot of a subset of the machine registers.

This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.

Since the PEBS hardware assist is trap-like, the reported IP is always
one instruction after the actual instruction that triggered the event.

This implements a simple PEBS model that always takes a single PEBS
event at a time. This is done so that the interaction with the rest of
the system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).

It adds an ABI element: perf_event_attr::precise, which indicates that
we wish to use this (constrained, but precise) mode.

Signed-off-by: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 80acbf3d5de1..42307b50c787 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -203,8 +203,9 @@ struct perf_event_attr {
 				enable_on_exec : 1, /* next exec enables     */
 				task           : 1, /* trace fork/exit       */
 				watermark      : 1, /* wakeup_watermark      */
+				precise        : 1, /* OoO invariant counter */
 
-				__reserved_1   : 49;
+				__reserved_1   : 48;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
-- 
cgit v1.2.1


From caff2befffe899e63df5cc760b7ed01cfd902685 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 3 Mar 2010 12:02:30 +0100
Subject: perf, x86: Implement simple LBR support

Implement simple support for the Intel Last-Branch-Record; it supports
all hardware that implements FREEZE_LBRS_ON_PMI, but does not (yet)
implement the LBR config register.

The Intel LBR is a FIFO of From,To addresses describing the last few
branches the hardware took.

This patch does not add a perf interface to the LBR, but merely
provides an interface for internal use.

Signed-off-by: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.544191154@chello.nl>
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 42307b50c787..ab4fd9ede264 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -467,6 +467,17 @@ struct perf_raw_record {
 	void				*data;
 };
 
+struct perf_branch_entry {
+	__u64				from;
+	__u64				to;
+	__u64				flags;
+};
+
+struct perf_branch_stack {
+	__u64				nr;
+	struct perf_branch_entry	entries[0];
+};
+
 struct task_struct;
 
 /**
-- 
cgit v1.2.1


From ef21f683a045a79b6aa86ad81e5fdfc0d5ddd250 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 3 Mar 2010 13:12:23 +0100
Subject: perf, x86: use LBR for PEBS IP+1 fixup

Use the LBR to fix up the PEBS IP+1 issue.

As said, PEBS reports the next instruction; here we use the LBR to find
the last branch and from that construct the actual IP.
If the IP matches the LBR-TO, we use LBR-FROM, otherwise we use the
LBR-TO address as the beginning of the last basic block and decode
forward. Once we find a match to the current IP, we use the previous
location.

This patch introduces a new ABI element: PERF_RECORD_MISC_EXACT, which
conveys that the reported IP (PERF_SAMPLE_IP) is the exact instruction
that caused the event (barring CPU errata).

The fixup can fail due to various reasons:

 1) LBR contains invalid data (quite possible)
 2) part of the basic block got paged out
 3) the reported IP isn't part of the basic block (see 1)

Signed-off-by: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: Masami Hiramatsu
Cc: "Zhang, Yanmin"
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.619375431@chello.nl>
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ab4fd9ede264..be85f7c4a94f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -294,6 +294,12 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_USER			(2 << 0)
 #define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
 
+#define PERF_RECORD_MISC_EXACT			(1 << 14)
+/*
+ * Reserve the last bit to indicate some extended misc field
+ */
+#define PERF_RECORD_MISC_EXT_RESERVED		(1 << 15)
+
 struct perf_event_header {
 	__u32	type;
 	__u16	misc;
-- 
cgit v1.2.1


From faa4602e47690fb11221e00f9b9697c8dc0d4b19 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 25 Mar 2010 14:51:50 +0100
Subject: x86, perf, bts, mm: Delete the never used BTS-ptrace code

Support for the PMU's BTS features has been upstreamed in v2.6.32, but
we still have the old and disabled ptrace-BTS, as Linus noticed it not
so long ago.

It's buggy: TIF_DEBUGCTLMSR is trampling all over that MSR without
regard for other uses (perf) and doesn't provide the flexibility needed
for perf either.

Its users are ptrace-block-step and ptrace-bts; ptrace-bts was never
used, and ptrace-block-step can be implemented using a much simpler
approach.

So axe all 3000 lines of it. That includes the *locked_memory*() APIs
in mm/mlock.c as well.
Reported-by: Linus Torvalds
Signed-off-by: Peter Zijlstra
Cc: Roland McGrath
Cc: Oleg Nesterov
Cc: Markus Metzger
Cc: Steven Rostedt
Cc: Andrew Morton
LKML-Reference: <20100325135413.938004390@chello.nl>
Signed-off-by: Ingo Molnar
---
 include/linux/ftrace.h | 12 ------------
 include/linux/mm.h     |  4 ----
 include/linux/ptrace.h | 12 ------------
 include/linux/sched.h  |  9 ---------
 4 files changed, 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 01e6adea07ec..cc12b3c556b3 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -504,18 +504,6 @@ extern int ftrace_dump_on_oops;
 #define INIT_TRACE_RECURSION
 #endif
 
-#ifdef CONFIG_HW_BRANCH_TRACER
-
-void trace_hw_branch(u64 from, u64 to);
-void trace_hw_branch_oops(void);
-
-#else /* CONFIG_HW_BRANCH_TRACER */
-
-static inline void trace_hw_branch(u64 from, u64 to) {}
-static inline void trace_hw_branch_oops(void) {}
-
-#endif /* CONFIG_HW_BRANCH_TRACER */
-
 #ifdef CONFIG_FTRACE_SYSCALLS
 
 unsigned long arch_syscall_addr(int nr);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e70f21beb4b4..c8442b655111 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -19,7 +19,6 @@ struct anon_vma;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
-struct rlimit;
 
 #ifndef CONFIG_DISCONTIGMEM	/* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1449,9 +1448,6 @@ int vmemmap_populate_basepages(struct page *start_page,
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
 
-extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
-				 size_t size);
-extern void refund_locked_memory(struct mm_struct *mm, size_t size);
 
 enum mf_flags {
 	MF_COUNT_INCREASED = 1 << 0,
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index e1fb60729979..4272521e29e9 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -345,18 +345,6 @@ static inline void user_single_step_siginfo(struct task_struct *tsk,
 #define arch_ptrace_stop(code, info)		do { } while (0)
 #endif
 
-#ifndef arch_ptrace_untrace
-/*
- * Do machine-specific work before untracing child.
- *
- * This is called for a normal detach as well as from ptrace_exit()
- * when the tracing task dies.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
-#define arch_ptrace_untrace(task)		do { } while (0)
-#endif
-
 extern int task_current_syscall(struct task_struct *target, long *callno,
 				unsigned long args[6], unsigned int maxargs,
 				unsigned long *sp, unsigned long *pc);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dad7f668ebf7..e0447c64af6a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -99,7 +99,6 @@ struct futex_pi_state;
 struct robust_list_head;
 struct bio_list;
 struct fs_struct;
-struct bts_context;
 struct perf_event_context;
 
 /*
@@ -1272,12 +1271,6 @@ struct task_struct {
 	struct list_head ptraced;
 	struct list_head ptrace_entry;
 
-	/*
-	 * This is the tracer handle for the ptrace BTS extension.
-	 * This field actually belongs to the ptracer task.
-	 */
-	struct bts_context *bts;
-
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
 
@@ -2123,10 +2116,8 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
-extern void wait_task_context_switch(struct task_struct *p);
 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
-static inline void wait_task_context_switch(struct task_struct *p) {}
 static inline unsigned long wait_task_inactive(struct task_struct *p,
 					       long match_state)
 {
-- 
cgit v1.2.1


From 6cc8a7c1d8560c042f486b23318a6291569ab96b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 19 Mar 2010 01:23:53 +0100
Subject: perf: Fetch hot regs from the template caller

Trace events can be defined from a template using
DECLARE_EVENT_CLASS/DEFINE_EVENT or directly with TRACE_EVENT.

In both cases we have a template tracepoint handler, used to record the
trace, to which we pass our ftrace event instance. At the function
level, if the class is named "foo" and the event is named "blah", we
have the following chain of calls:

	perf_trace_blah() -> perf_trace_templ_foo()

In the case we have several events sharing the class "foo", we'll have
multiple users of perf_trace_templ_foo(), and it won't be inlined by
the compiler. This is usually what happens with the
DECLARE_EVENT_CLASS/DEFINE_EVENT based definition.

But if perf_trace_blah() is the only caller of perf_trace_templ_foo(),
there are fair chances that it will be inlined.

The problem is that we fetch the regs from perf_trace_templ_foo() after
rewinding the frame pointer to the second caller: we want to reach the
caller of perf_trace_blah() to get the right source of the event. And
we do this by always assuming that perf_trace_templ_foo() is not
inlined. But as shown above, this is not always true. And if it is
inlined we miss the first caller, losing the most important level of
precision.

We get:

    61.31%  ls  [kernel.kallsyms]  [k] do_softirq
            |
            --- do_softirq
                irq_exit
                do_IRQ
                common_interrupt
                |
                |--25.00%-- tty_buffer_request_room

Instead of:

    61.31%  ls  [kernel.kallsyms]  [k] __do_softirq
            |
            --- __do_softirq
                do_softirq
                irq_exit
                do_IRQ
                common_interrupt
                |
                |--25.00%-- tty_buffer_request_room

To fix this, we fetch the regs from perf_trace_blah() rather than
perf_trace_templ_foo(), so that we don't have to deal with inlining
surprises. That also brings us the advantage of having the true source
of the event even if we don't have frame pointers.
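For a hypothetical event "blah" in a class "foo", the generated
function now looks roughly like this (an illustrative expansion of the
macro changes in this patch; proto and args stand in for the event's
parameter list):

static notrace void perf_trace_blah(proto)
{
	struct ftrace_event_call *event_call = &event_blah;
	/* snapshot the regs exactly one call level deep: our caller */
	struct pt_regs *__regs = &get_cpu_var(perf_trace_regs);

	perf_fetch_caller_regs(__regs, 1);
	/* the template handler just records; it no longer touches regs */
	perf_trace_templ_foo(event_call, __regs, args);
	put_cpu_var(perf_trace_regs);
}

This way the skip level is fixed at 1 regardless of whether
perf_trace_templ_foo() gets inlined into its lone caller.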
Signed-off-by: Frederic Weisbecker
Cc: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: Paul Mackerras
Cc: Ingo Molnar
---
 include/trace/ftrace.h | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index ea6f9d4a20e9..882c64832ffe 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -758,13 +758,12 @@ __attribute__((section("_ftrace_events"))) event_##call = {	\
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static notrace void							\
 perf_trace_templ_##call(struct ftrace_event_call *event_call,		\
-			proto)						\
+			struct pt_regs *__regs, proto)			\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_raw_##call *entry;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
-	struct pt_regs *__regs;						\
 	int __entry_size;						\
 	int __data_size;						\
 	int rctx;							\
@@ -785,20 +784,22 @@ perf_trace_templ_##call(struct ftrace_event_call *event_call,		\
 									\
 	{ assign; }							\
 									\
-	__regs = &__get_cpu_var(perf_trace_regs);			\
-	perf_fetch_caller_regs(__regs, 2);				\
-									\
 	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
 			      __count, irq_flags, __regs);		\
 }
 
 #undef DEFINE_EVENT
-#define DEFINE_EVENT(template, call, proto, args)		\
-static notrace void perf_trace_##call(proto)			\
-{								\
-	struct ftrace_event_call *event_call = &event_##call;	\
-								\
-	perf_trace_templ_##template(event_call, args);		\
+#define DEFINE_EVENT(template, call, proto, args)			\
+static notrace void perf_trace_##call(proto)				\
+{									\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	struct pt_regs *__regs = &get_cpu_var(perf_trace_regs);	\
+									\
+	perf_fetch_caller_regs(__regs, 1);				\
+									\
+	perf_trace_templ_##template(event_call, __regs, args);		\
+									\
+	put_cpu_var(perf_trace_regs);					\
 }
 
 #undef DEFINE_EVENT_PRINT
-- 
cgit v1.2.1


From 76e1d9047e4edefb8ada20aa90d5762306082bd6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 5 Apr 2010 15:35:57 +0200
Subject: perf: Store active software events in a hashlist

Each time a software event triggers, we need to walk through the entire
list of events from the current cpu and task contexts to retrieve a
running perf event that matches. We also need to check that a matching
perf event is actually counting.

This walk is wasteful and makes the event fast path scale poorly as the
number of events running on the same contexts grows.

To solve this, we store the running perf events in a hashlist to get
immediate access to them, keyed by their type:event_id, when they
trigger.

v2:
 - Fix SWEVENT_HLIST_SIZE definition (and re-learn some basic maths
   along the way)
 - Only allocate hlist for online cpus, but keep track of the refcount
   on offline possible cpus too, so that we allocate it if needed when
   it becomes online.
 - Drop the kref use as it's not adapted to our tricks anymore.

v3:
 - Fix bad refcount check (address instead of value). Thanks to Eric
   Dumazet who spotted this.
 - While exiting cpu, move the hlist release out of the IPI path to
   lock the hlist mutex sanely.
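As an illustration of the resulting fast path, here is a minimal sketch
of how a triggering software event could be mapped to its bucket (the
choice of hash function here is an assumption for the sketch, not
necessarily what the perf core uses):

#include <linux/hash.h>

/* Illustrative: pick the hlist bucket for a type:event_id pair. */
static struct hlist_head *
swevent_bucket(struct swevent_hlist *hlist, u32 type, u32 event_id)
{
	u64 key = ((u64)type << 32) | event_id;

	/* fold the 64-bit key down to SWEVENT_HLIST_BITS bits */
	return &hlist->heads[hash_64(key, SWEVENT_HLIST_BITS)];
}

Walking one short bucket then replaces the walk over every event in the
cpu and task contexts.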
Signed-off-by: Frederic Weisbecker
Cc: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: Paul Mackerras
Cc: Ingo Molnar
---
 include/linux/perf_event.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6e96cc8225d4..bf896d0b2e9c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -589,6 +589,14 @@ enum perf_group_flag {
 	PERF_GROUP_SOFTWARE = 0x1,
 };
 
+#define SWEVENT_HLIST_BITS	8
+#define SWEVENT_HLIST_SIZE	(1 << SWEVENT_HLIST_BITS)
+
+struct swevent_hlist {
+	struct hlist_head	heads[SWEVENT_HLIST_SIZE];
+	struct rcu_head		rcu_head;
+};
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -597,6 +605,7 @@ struct perf_event {
 	struct list_head		group_entry;
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
+	struct hlist_node		hlist_entry;
 	int				nr_siblings;
 	int				group_flags;
 	struct perf_event		*group_leader;
@@ -744,6 +753,9 @@ struct perf_cpu_context {
 	int				active_oncpu;
 	int				max_pertask;
 	int				exclusive;
+	struct swevent_hlist		*swevent_hlist;
+	struct mutex			hlist_mutex;
+	int				hlist_refcount;
 
 	/*
 	 * Recursion avoidance:
-- 
cgit v1.2.1


From 39447b386c846bbf1c56f6403c5282837486200f Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin"
Date: Mon, 19 Apr 2010 13:32:41 +0800
Subject: perf: Enhance perf to allow for guest statistic collection from host

The patch below introduces perf_guest_info_callbacks and related
register/unregister functions, and adds more PERF_RECORD_MISC_XXX bits
to denote guest kernel and guest user space.

Signed-off-by: Zhang Yanmin
Signed-off-by: Avi Kivity
---
 include/linux/perf_event.h | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index bf896d0b2e9c..24de5f181a41 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -288,11 +288,13 @@ struct perf_event_mmap_page {
 	__u64	data_tail;		/* user-space written tail */
 };
 
-#define PERF_RECORD_MISC_CPUMODE_MASK		(3 << 0)
+#define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
 #define PERF_RECORD_MISC_CPUMODE_UNKNOWN	(0 << 0)
 #define PERF_RECORD_MISC_KERNEL		(1 << 0)
 #define PERF_RECORD_MISC_USER			(2 << 0)
 #define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
+#define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
+#define PERF_RECORD_MISC_GUEST_USER		(5 << 0)
 
 #define PERF_RECORD_MISC_EXACT			(1 << 14)
 /*
@@ -446,6 +448,12 @@ enum perf_callchain_context {
 # include <asm/perf_event.h>
 #endif
 
+struct perf_guest_info_callbacks {
+	int (*is_in_guest) (void);
+	int (*is_user_mode) (void);
+	unsigned long (*get_guest_ip) (void);
+};
+
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 #include <asm/hw_breakpoint.h>
 #endif
@@ -932,6 +940,12 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)
 		__perf_event_mmap(vma);
 }
 
+extern struct perf_guest_info_callbacks *perf_guest_cbs;
+extern int perf_register_guest_info_callbacks(
+		struct perf_guest_info_callbacks *);
+extern int perf_unregister_guest_info_callbacks(
+		struct perf_guest_info_callbacks *);
+
 extern void perf_event_comm(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
 
@@ -1001,6 +1015,11 @@ perf_sw_event(u32 event_id, u64 nr, int nmi,
 static inline void
 perf_bp_event(struct perf_event *event, void *data)		{ }
 
+static inline int perf_register_guest_info_callbacks
+(struct perf_guest_info_callbacks *) {return 0; }
+static inline int perf_unregister_guest_info_callbacks
+(struct perf_guest_info_callbacks *) {return 0; }
+
 static inline void perf_event_mmap(struct vm_area_struct *vma)	{ }
 static inline void perf_event_comm(struct task_struct *tsk)	{ }
 static inline void perf_event_fork(struct task_struct *tsk)	{ }
-- 
cgit v1.2.1


From dcf46b9443ad48a227a61713adea001228925adf Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin"
Date: Tue, 20 Apr 2010 10:13:58 +0800
Subject: perf & kvm: Clean up some of the guest profiling callback API details

Fix some build bugs and programming style issues:

 - use valid C
 - fix up various style details

Signed-off-by: Zhang Yanmin
Cc: Avi Kivity
Cc: Peter Zijlstra
Cc: Sheng Yang
Cc: Marcelo Tosatti
Cc: Joerg Roedel
Cc: Jes Sorensen
Cc: Gleb Natapov
Cc: Zachary Amsden
Cc: zhiteng.huang@intel.com
Cc: tim.c.chen@intel.com
Cc: Arnaldo Carvalho de Melo
LKML-Reference: <1271729638.2078.624.camel@ymzhang.sh.intel.com>
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24de5f181a41..ace31fbac513 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -941,10 +941,8 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)
 }
 
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
-extern int perf_register_guest_info_callbacks(
-		struct perf_guest_info_callbacks *);
-extern int perf_unregister_guest_info_callbacks(
-		struct perf_guest_info_callbacks *);
+extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
+extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 
 extern void perf_event_comm(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
@@ -1016,9 +1014,9 @@ static inline void
 perf_bp_event(struct perf_event *event, void *data)		{ }
 
 static inline int perf_register_guest_info_callbacks
-(struct perf_guest_info_callbacks *) {return 0; }
+(struct perf_guest_info_callbacks *callbacks)			{ return 0; }
 static inline int perf_unregister_guest_info_callbacks
-(struct perf_guest_info_callbacks *) {return 0; }
+(struct perf_guest_info_callbacks *callbacks)			{ return 0; }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)	{ }
 static inline void perf_event_comm(struct task_struct *tsk)	{ }
-- 
cgit v1.2.1


From 73266fc1df2f94cf72b3beba3eee3b88ed0b0664 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 22 Apr 2010 05:05:45 +0200
Subject: hw-breakpoints: Tag ptrace breakpoint as exclude_kernel

Tag ptrace breakpoints with the exclude_kernel attribute set. This will
make it easier to set generic policies on breakpoints, when it comes to
ensuring that nobody unprivileged tries to set a breakpoint on the
kernel.

Signed-off-by: Frederic Weisbecker
Acked-by: Paul Mundt
Cc: Will Deacon
Cc: Mahesh Salgaonkar
Cc: K. Prasad
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Ingo Molnar
---
 include/linux/hw_breakpoint.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index c70d27af03f9..a0aa5a9cfb0e 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -34,6 +34,12 @@ static inline void hw_breakpoint_init(struct perf_event_attr *attr)
 	attr->sample_period = 1;
 }
 
+static inline void ptrace_breakpoint_init(struct perf_event_attr *attr)
+{
+	hw_breakpoint_init(attr);
+	attr->exclude_kernel = 1;
+}
+
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
-- 
cgit v1.2.1


From 0102752e4c9e0655b39734550d4c35327954f7f9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sun, 11 Apr 2010 18:55:56 +0200
Subject: hw-breakpoints: Separate constraint space for data and instruction
 breakpoints

There are two prevailing approaches for archs to implement hardware
breakpoints.

The first is to separate the breakpoint address pattern definition
space between data and instruction breakpoints. We then typically have
distinct instruction address breakpoint registers and data address
breakpoint registers, delivered with separate control registers for
data and instruction breakpoints as well. This is the case of PowerPC
and ARM, for example.

The second consists of a merged breakpoint address space definition
between data and instruction breakpoints. Address registers can host
either an instruction or a data address, and the access mode for the
breakpoint is defined in a control register. This is the case of x86
and SuperH.

This patch adds a new CONFIG_HAVE_MIXED_BREAKPOINTS_REGS config that
archs can select if they belong to the second case. Those will have
their slot allocation merged for instruction and data breakpoints. The
others will have separate slot tracking between data and instruction
breakpoints.

Signed-off-by: Frederic Weisbecker
Acked-by: Paul Mundt
Cc: Will Deacon
Cc: Mahesh Salgaonkar
Cc: K. Prasad
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Ingo Molnar
---
 include/linux/hw_breakpoint.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index a0aa5a9cfb0e..7e8899093098 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -9,9 +9,12 @@ enum {
 };
 
 enum {
-	HW_BREAKPOINT_R = 1,
-	HW_BREAKPOINT_W = 2,
-	HW_BREAKPOINT_X = 4,
+	HW_BREAKPOINT_EMPTY	= 0,
+	HW_BREAKPOINT_R		= 1,
+	HW_BREAKPOINT_W		= 2,
+	HW_BREAKPOINT_RW	= HW_BREAKPOINT_R | HW_BREAKPOINT_W,
+	HW_BREAKPOINT_X		= 4,
+	HW_BREAKPOINT_INVALID	= HW_BREAKPOINT_RW | HW_BREAKPOINT_X,
 };
 
 #ifdef __KERNEL__
-- 
cgit v1.2.1


From feef47d0cb530e8419dfa0b48141b538b89b1b1a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 23 Apr 2010 05:59:55 +0200
Subject: hw-breakpoints: Get the number of available registers on boot
 dynamically

The breakpoint generic layer assumes that archs always know in advance
the static number of address registers available to host breakpoints
through the HBP_NUM macro.

However this is not true for every arch. For example, ARM needs to get
this information dynamically to handle the compatibility between
different versions.

To solve this, this patch proposes to drop the static HBP_NUM macro and
let the arch provide the number of available slots through a new
hw_breakpoint_slots() function.
For archs that have CONFIG_HAVE_MIXED_BREAKPOINTS_REGS selected, it
will be called once, since the number of registers covers instruction
and data breakpoints together. For the others it will be called first
to get the number of instruction breakpoint registers and a second time
to get the number of data breakpoint registers; the targeted type is
given as a parameter to hw_breakpoint_slots().

Reported-by: Will Deacon
Signed-off-by: Frederic Weisbecker
Acked-by: Paul Mundt
Cc: Mahesh Salgaonkar
Cc: K. Prasad
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Jason Wessel
Cc: Ingo Molnar
---
 include/linux/hw_breakpoint.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 7e8899093098..a2d6ea49ec56 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -17,6 +17,16 @@ enum {
 	HW_BREAKPOINT_INVALID	= HW_BREAKPOINT_RW | HW_BREAKPOINT_X,
 };
 
+enum bp_type_idx {
+	TYPE_INST	= 0,
+#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
+	TYPE_DATA	= 0,
+#else
+	TYPE_DATA	= 1,
+#endif
+	TYPE_MAX
+};
+
 #ifdef __KERNEL__
 
 #include <linux/perf_event.h>
-- 
cgit v1.2.1


From 4fd38e4595e2f6c9d27732c042a0e16b2753049c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 6 May 2010 17:31:38 +0200
Subject: perf: Fix exit() vs PERF_FORMAT_GROUP

Both Stephane and Corey reported that PERF_FORMAT_GROUP didn't work as
expected if the task the counters were attached to quit before the
read() call.

The cause is that we unconditionally destroy the grouping when we
remove counters from their context. Fix this by destroying the grouping
only when we free the counter itself.

Reported-by: Corey Ashford
Reported-by: Stephane Eranian
Signed-off-by: Peter Zijlstra
LKML-Reference: <1273160566.5605.404.camel@twins>
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c8e375440403..bf8f3c003297 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -522,6 +522,7 @@ struct pmu {
  * enum perf_event_active_state - the states of a event
  */
 enum perf_event_active_state {
+	PERF_EVENT_STATE_FREE		= -3,
 	PERF_EVENT_STATE_ERROR		= -2,
 	PERF_EVENT_STATE_OFF		= -1,
 	PERF_EVENT_STATE_INACTIVE	= 0,
-- 
cgit v1.2.1


From ab608344bcbde4f55ec4cd911b686b0ce3eae076 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 8 Apr 2010 23:03:20 +0200
Subject: perf, x86: Improve the PEBS ABI

Rename perf_event_attr::precise to perf_event_attr::precise_ip and
widen it to 2 bits. This new field describes the required precision of
the PERF_SAMPLE_IP field:

 0 - SAMPLE_IP can have arbitrary skid
 1 - SAMPLE_IP must have constant skid
 2 - SAMPLE_IP requested to have 0 skid
 3 - SAMPLE_IP must have 0 skid

And modify the Intel PEBS code accordingly. The PEBS implementation now
supports up to precise_ip == 2, where we perform the IP fixup.

Also s/PERF_RECORD_MISC_EXACT/&_IP/ to clarify its meaning; this bit
should be set for each PERF_SAMPLE_IP field known to match the actual
instruction triggering the event.

This new scheme allows for a PEBS mode that uses the buffer for more
than a single event.
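From user space, this ABI is used roughly as follows (a sketch; the
fallback loop mirrors the common idiom of asking for the tightest skid
the hardware supports, and error handling is omitted):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Illustrative: open a cycles event with the tightest available skid. */
static int open_precise_cycles(pid_t pid)
{
	struct perf_event_attr attr;
	int precise, fd = -1;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP;

	for (precise = 2; precise >= 0 && fd < 0; precise--) {
		attr.precise_ip = precise;	/* 2: request 0 skid */
		fd = syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
	}
	return fd;
}

Samples whose header carries PERF_RECORD_MISC_EXACT_IP can then be
trusted to point at the triggering instruction.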
Signed-off-by: Peter Zijlstra
Cc: Paul Mackerras
Cc: Stephane Eranian
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6be4a0f9137c..23cd0057a681 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -203,9 +203,19 @@ struct perf_event_attr {
 				enable_on_exec : 1, /* next exec enables     */
 				task           : 1, /* trace fork/exit       */
 				watermark      : 1, /* wakeup_watermark      */
-				precise        : 1, /* OoO invariant counter */
-
-				__reserved_1   : 48;
+				/*
+				 * precise_ip:
+				 *
+				 *  0 - SAMPLE_IP can have arbitrary skid
+				 *  1 - SAMPLE_IP must have constant skid
+				 *  2 - SAMPLE_IP requested to have 0 skid
+				 *  3 - SAMPLE_IP must have 0 skid
+				 *
+				 *  See also PERF_RECORD_MISC_EXACT_IP
+				 */
+				precise_ip     : 2, /* skid constraint       */
+
+				__reserved_1   : 47;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -296,7 +306,12 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
 #define PERF_RECORD_MISC_GUEST_USER		(5 << 0)
 
-#define PERF_RECORD_MISC_EXACT			(1 << 14)
+/*
+ * Indicates that the content of PERF_SAMPLE_IP points to
+ * the actual instruction that triggered the event. See also
+ * perf_event_attr::precise_ip.
+ */
+#define PERF_RECORD_MISC_EXACT_IP		(1 << 14)
 /*
  * Reserve the last bit to indicate some extended misc field
  */
-- 
cgit v1.2.1


From 6bde9b6ce0127e2a56228a2071536d422be31336 Mon Sep 17 00:00:00 2001
From: Lin Ming
Date: Fri, 23 Apr 2010 13:56:00 +0800
Subject: perf: Add group scheduling transactional APIs

Add group scheduling transactional APIs to struct pmu. These APIs will
be implemented in arch code, based on Peter's idea as below.

> the idea behind hw_perf_group_sched_in() is to not perform
> schedulability tests on each event in the group, but to add the group
> as a whole and then perform one test.
>
> Of course, when that test fails, you'll have to roll-back the whole
> group again.
>
> So start_txn (or a better name) would simply toggle a flag in the pmu
> implementation that will make pmu::enable() not perform the
> schedulability test.
>
> Then commit_txn() will perform the schedulability test (so note the
> method has to have a !void return value.)
>
> This will allow us to use the regular
> kernel/perf_event.c::group_sched_in() and all the rollback code.
> Currently each hw_perf_group_sched_in() implementation duplicates all
> the rollback code (with various bugs).

->start_txn:
Start group events scheduling transaction; set a flag to make
pmu::enable() not perform the schedulability test, it will be performed
at commit time.

->commit_txn:
Commit group events scheduling transaction; perform the schedulability
test for the group as a whole.

->cancel_txn:
Stop group events scheduling transaction; clear the flag so
pmu::enable() will perform the schedulability test.
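The intended use from the core scheduling path looks roughly like this
(a sketch only; event_sched_in() stands in for the real per-event
enable logic, which also updates event state and timestamps):

/* Illustrative: add a whole group, then do one schedulability test. */
static int group_sched_in_sketch(struct perf_event *leader, struct pmu *pmu)
{
	struct perf_event *sub;

	pmu->start_txn(pmu);	/* pmu::enable() skips the per-event test */

	if (event_sched_in(leader))
		goto fail;

	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		if (event_sched_in(sub))
			goto fail;
	}

	if (!pmu->commit_txn(pmu))	/* one test for the whole group */
		return 0;
fail:
	pmu->cancel_txn(pmu);		/* roll the whole group back */
	return -EAGAIN;
}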
Reviewed-by: Stephane Eranian
Reviewed-by: Frederic Weisbecker
Signed-off-by: Lin Ming
Cc: David Miller
Cc: Paul Mackerras
Signed-off-by: Peter Zijlstra
LKML-Reference: <1272002160.5707.60.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 23cd0057a681..4924c96d7e2d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -547,6 +547,8 @@ struct hw_perf_event {
 
 struct perf_event;
 
+#define PERF_EVENT_TXN_STARTED 1
+
 /**
  * struct pmu - generic performance monitoring unit
  */
@@ -557,6 +559,16 @@ struct pmu {
 	void (*stop)			(struct perf_event *event);
 	void (*read)			(struct perf_event *event);
 	void (*unthrottle)		(struct perf_event *event);
+
+	/*
+	 * group events scheduling is treated as a transaction,
+	 * add group events as a whole and perform one schedulability test.
+	 * If test fails, roll back the whole group
+	 */
+
+	void (*start_txn)		(const struct pmu *pmu);
+	void (*cancel_txn)		(const struct pmu *pmu);
+	int  (*commit_txn)		(const struct pmu *pmu);
 };
 
 /**
@@ -823,9 +835,6 @@ extern void perf_disable(void);
 extern void perf_enable(void);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
-extern int hw_perf_group_sched_in(struct perf_event *group_leader,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx);
 extern void perf_event_update_userpage(struct perf_event *event);
 extern int perf_event_release_kernel(struct perf_event *event);
 extern struct perf_event *
-- 
cgit v1.2.1


From 883a2a3189dae9d2912c417e47152f51cb922a3f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sat, 8 May 2010 06:16:11 +0200
Subject: tracing: Drop lock_acquired waittime field

Drop the waittime field from the lock_acquired event; we can calculate
it by subtracting the matching lock_acquire event timestamp from the
lock_acquired one. It is not needed and takes useless space in the
traces.

Signed-off-by: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: Paul Mackerras
Cc: Hitoshi Mitake
Cc: Steven Rostedt
---
 include/trace/events/lock.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
index 5c1dcfc16c60..17ca287ae176 100644
--- a/include/trace/events/lock.h
+++ b/include/trace/events/lock.h
@@ -78,24 +78,21 @@ TRACE_EVENT(lock_contended,
 );
 
 TRACE_EVENT(lock_acquired,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
 
-	TP_ARGS(lock, ip, waittime),
+	TP_ARGS(lock, ip),
 
 	TP_STRUCT__entry(
 		__string(name, lock->name)
-		__field(s64, wait_nsec)
 		__field(void *, lockdep_addr)
 	),
 
 	TP_fast_assign(
 		__assign_str(name, lock->name);
-		__entry->wait_nsec = waittime;
 		__entry->lockdep_addr = lock;
 	),
 
-	TP_printk("%p %s (%llu ns)", __entry->lockdep_addr,
-		  __get_str(name),
-		  __entry->wait_nsec)
+	TP_printk("%p %s", __entry->lockdep_addr,
+		  __get_str(name))
 );
 
 #endif
-- 
cgit v1.2.1


From 93135439459920c4d856f4ab8f068c030085c8df Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sat, 8 May 2010 06:24:25 +0200
Subject: tracing: Drop the nested field from lock_release event

Drop the nested field as we don't use it. Every nested state can be
computed by a state machine in post-processing already.
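Both dropped fields are easy to recover after the fact; here is a
sketch of such post-processing over the remaining events (illustrative
userspace code, with state kept per lock instance):

#include <stdint.h>
#include <string.h>

/* Illustrative post-processing state, kept per lock instance. */
struct lock_track {
	uint64_t acquire_ts;	/* timestamp of the last lock_acquire */
	int depth;		/* nesting, recovered by simple counting */
};

/* Returns the recomputed wait time on lock_acquired, 0 otherwise. */
static uint64_t lock_event(struct lock_track *t, const char *ev, uint64_t ts)
{
	if (!strcmp(ev, "lock_acquire")) {
		t->acquire_ts = ts;
		t->depth++;			/* replaces the nested field */
	} else if (!strcmp(ev, "lock_acquired")) {
		return ts - t->acquire_ts;	/* replaces the waittime field */
	} else if (!strcmp(ev, "lock_release")) {
		t->depth--;
	}
	return 0;
}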
Signed-off-by: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Hitoshi Mitake
Cc: Steven Rostedt
---
 include/trace/events/lock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
index 17ca287ae176..fde4c3853391 100644
--- a/include/trace/events/lock.h
+++ b/include/trace/events/lock.h
@@ -37,9 +37,9 @@ TRACE_EVENT(lock_acquire,
 
 TRACE_EVENT(lock_release,
 
-	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
 
-	TP_ARGS(lock, nested, ip),
+	TP_ARGS(lock, ip),
 
 	TP_STRUCT__entry(
 		__string(name, lock->name)
-- 
cgit v1.2.1


From 2c193c736803ceb547daec725e5c5d992d039f20 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sat, 8 May 2010 06:36:02 +0200
Subject: tracing: Factorize lock events in a lock class

lock_acquired, lock_contended and lock_release now share the same
prototype and format. Let's factorize them into a lock event class.

Signed-off-by: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Hitoshi Mitake
Cc: Steven Rostedt
---
 include/trace/events/lock.h | 48 ++++++++++++++-------------------------------
 1 file changed, 15 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
index fde4c3853391..2821b86de63b 100644
--- a/include/trace/events/lock.h
+++ b/include/trace/events/lock.h
@@ -35,15 +35,15 @@ TRACE_EVENT(lock_acquire,
 		  __get_str(name))
 );
 
-TRACE_EVENT(lock_release,
+DECLARE_EVENT_CLASS(lock,
 
 	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
 
 	TP_ARGS(lock, ip),
 
 	TP_STRUCT__entry(
-		__string(name, lock->name)
-		__field(void *, lockdep_addr)
+		__string(	name,	lock->name	)
+		__field(	void *,	lockdep_addr	)
 	),
 
 	TP_fast_assign(
@@ -51,48 +51,30 @@ TRACE_EVENT(lock_release,
 		__entry->lockdep_addr = lock;
 	),
 
-	TP_printk("%p %s",
-		  __entry->lockdep_addr, __get_str(name))
+	TP_printk("%p %s", __entry->lockdep_addr, __get_str(name))
 );
 
-#ifdef CONFIG_LOCK_STAT
-
-TRACE_EVENT(lock_contended,
+DEFINE_EVENT(lock, lock_release,
 
 	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
 
-	TP_ARGS(lock, ip),
-
-	TP_STRUCT__entry(
-		__string(name, lock->name)
-		__field(void *, lockdep_addr)
-	),
+	TP_ARGS(lock, ip)
+);
 
-	TP_fast_assign(
-		__assign_str(name, lock->name);
-		__entry->lockdep_addr = lock;
-	),
+#ifdef CONFIG_LOCK_STAT
 
-	TP_printk("%p %s",
-		  __entry->lockdep_addr, __get_str(name))
-);
+DEFINE_EVENT(lock, lock_contended,
 
-TRACE_EVENT(lock_acquired,
 	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
 
-	TP_ARGS(lock, ip),
+	TP_ARGS(lock, ip)
+);
 
-	TP_STRUCT__entry(
-		__string(name, lock->name)
-		__field(void *, lockdep_addr)
-	),
+DEFINE_EVENT(lock, lock_acquired,
 
-	TP_fast_assign(
-		__assign_str(name, lock->name);
-		__entry->lockdep_addr = lock;
-	),
-	TP_printk("%p %s", __entry->lockdep_addr,
-		  __get_str(name))
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+	TP_ARGS(lock, ip)
 );
 
 #endif
-- 
cgit v1.2.1


From e3174cfd2a1e28fff774681f00a0eef3d31da970 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 11 May 2010 08:31:49 +0200
Subject: Revert "perf: Fix exit() vs PERF_FORMAT_GROUP"

This reverts commit 4fd38e4595e2f6c9d27732c042a0e16b2753049c.

It causes various crashes and hangs when events are activated. The
cause is not fully understood yet, but we need to revert it because the
effects are severe.
Reported-by: Stephane Eranian
Reported-by: Lin Ming
Cc: Peter Zijlstra
Cc: Corey Ashford
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
Cc: Frederic Weisbecker
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4924c96d7e2d..3fd5c82e0e18 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -575,7 +575,6 @@ struct pmu {
  * enum perf_event_active_state - the states of a event
  */
 enum perf_event_active_state {
-	PERF_EVENT_STATE_FREE		= -3,
 	PERF_EVENT_STATE_ERROR		= -2,
 	PERF_EVENT_STATE_OFF		= -1,
 	PERF_EVENT_STATE_INACTIVE	= 0,
-- 
cgit v1.2.1