diff options
Diffstat (limited to 'kernel')
72 files changed, 5384 insertions, 2979 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 0b72d1a74be0..0b5ff083fa22 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,8 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ - async.o range.o -obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o + async.o range.o jump_label.o obj-y += groups.o ifdef CONFIG_FUNCTION_TRACER @@ -23,6 +22,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_perf_event.o = -pg +CFLAGS_REMOVE_irq_work.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o @@ -86,6 +86,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o obj-$(CONFIG_TINY_RCU) += rcutiny.o +obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o @@ -100,6 +101,7 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c9483d8f6140..291ba3d04bea 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,7 +138,7 @@ struct css_id { * is called after synchronize_rcu(). But for safe use, css_is_removed() * css_tryget() should be used for avoiding race. */ - struct cgroup_subsys_state *css; + struct cgroup_subsys_state __rcu *css; /* * ID of this css. */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b23c0979bbe7..51b143e2a07a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - ret = security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk); if (ret) return ret; if (threadgroup) { @@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - ret = security_task_setscheduler(c, 0, NULL); + ret = security_task_setscheduler(c); if (ret) { rcu_read_unlock(); return ret; diff --git a/kernel/early_res.c b/kernel/early_res.c deleted file mode 100644 index 7bfae887f211..000000000000 --- a/kernel/early_res.c +++ /dev/null @@ -1,590 +0,0 @@ -/* - * early_res, could be used to replace bootmem - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/mm.h> -#include <linux/early_res.h> -#include <linux/slab.h> -#include <linux/kmemleak.h> - -/* - * Early reserved memory areas. - */ -/* - * need to make sure this one is bigger enough before - * find_fw_memmap_area could be used - */ -#define MAX_EARLY_RES_X 32 - -struct early_res { - u64 start, end; - char name[15]; - char overlap_ok; -}; -static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; - -static int max_early_res __initdata = MAX_EARLY_RES_X; -static struct early_res *early_res __initdata = &early_res_x[0]; -static int early_res_count __initdata; - -static int __init find_overlapped_early(u64 start, u64 end) -{ - int i; - struct early_res *r; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - if (end > r->start && start < r->end) - break; - } - - return i; -} - -/* - * Drop the i-th range from the early reservation map, - * by copying any higher ranges down one over it, and - * clearing what had been the last slot. - */ -static void __init drop_range(int i) -{ - int j; - - for (j = i + 1; j < max_early_res && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; - early_res_count--; -} - -static void __init drop_range_partial(int i, u64 start, u64 end) -{ - u64 common_start, common_end; - u64 old_start, old_end; - - old_start = early_res[i].start; - old_end = early_res[i].end; - common_start = max(old_start, start); - common_end = min(old_end, end); - - /* no overlap ? */ - if (common_start >= common_end) - return; - - if (old_start < common_start) { - /* make head segment */ - early_res[i].end = common_start; - if (old_end > common_end) { - char name[15]; - - /* - * Save a local copy of the name, since the - * early_res array could get resized inside - * reserve_early_without_check() -> - * __check_and_double_early_res(), which would - * make the current name pointer invalid. - */ - strncpy(name, early_res[i].name, - sizeof(early_res[i].name) - 1); - /* add another for left over on tail */ - reserve_early_without_check(common_end, old_end, name); - } - return; - } else { - if (old_end > common_end) { - /* reuse the entry for tail left */ - early_res[i].start = common_end; - return; - } - /* all covered */ - drop_range(i); - } -} - -/* - * Split any existing ranges that: - * 1) are marked 'overlap_ok', and - * 2) overlap with the stated range [start, end) - * into whatever portion (if any) of the existing range is entirely - * below or entirely above the stated range. Drop the portion - * of the existing range that overlaps with the stated range, - * which will allow the caller of this routine to then add that - * stated range without conflicting with any existing range. - */ -static void __init drop_overlaps_that_are_ok(u64 start, u64 end) -{ - int i; - struct early_res *r; - u64 lower_start, lower_end; - u64 upper_start, upper_end; - char name[15]; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - - /* Continue past non-overlapping ranges */ - if (end <= r->start || start >= r->end) - continue; - - /* - * Leave non-ok overlaps as is; let caller - * panic "Overlapping early reservations" - * when it hits this overlap. - */ - if (!r->overlap_ok) - return; - - /* - * We have an ok overlap. We will drop it from the early - * reservation map, and add back in any non-overlapping - * portions (lower or upper) as separate, overlap_ok, - * non-overlapping ranges. - */ - - /* 1. Note any non-overlapping (lower or upper) ranges. */ - strncpy(name, r->name, sizeof(name) - 1); - - lower_start = lower_end = 0; - upper_start = upper_end = 0; - if (r->start < start) { - lower_start = r->start; - lower_end = start; - } - if (r->end > end) { - upper_start = end; - upper_end = r->end; - } - - /* 2. Drop the original ok overlapping range */ - drop_range(i); - - i--; /* resume for-loop on copied down entry */ - - /* 3. Add back in any non-overlapping ranges. */ - if (lower_end) - reserve_early_overlap_ok(lower_start, lower_end, name); - if (upper_end) - reserve_early_overlap_ok(upper_start, upper_end, name); - } -} - -static void __init __reserve_early(u64 start, u64 end, char *name, - int overlap_ok) -{ - int i; - struct early_res *r; - - i = find_overlapped_early(start, end); - if (i >= max_early_res) - panic("Too many early reservations"); - r = &early_res[i]; - if (r->end) - panic("Overlapping early reservations " - "%llx-%llx %s to %llx-%llx %s\n", - start, end - 1, name ? name : "", r->start, - r->end - 1, r->name); - r->start = start; - r->end = end; - r->overlap_ok = overlap_ok; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -/* - * A few early reservtations come here. - * - * The 'overlap_ok' in the name of this routine does -not- mean it - * is ok for these reservations to overlap an earlier reservation. - * Rather it means that it is ok for subsequent reservations to - * overlap this one. - * - * Use this entry point to reserve early ranges when you are doing - * so out of "Paranoia", reserving perhaps more memory than you need, - * just in case, and don't mind a subsequent overlapping reservation - * that is known to be needed. - * - * The drop_overlaps_that_are_ok() call here isn't really needed. - * It would be needed if we had two colliding 'overlap_ok' - * reservations, so that the second such would not panic on the - * overlap with the first. We don't have any such as of this - * writing, but might as well tolerate such if it happens in - * the future. - */ -void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) -{ - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 1); -} - -static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) -{ - u64 start, end, size, mem; - struct early_res *new; - - /* do we have enough slots left ? */ - if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) - return; - - /* double it */ - mem = -1ULL; - size = sizeof(struct early_res) * max_early_res * 2; - if (early_res == early_res_x) - start = 0; - else - start = early_res[0].end; - end = ex_start; - if (start + size < end) - mem = find_fw_memmap_area(start, end, size, - sizeof(struct early_res)); - if (mem == -1ULL) { - start = ex_end; - end = get_max_mapped(); - if (start + size < end) - mem = find_fw_memmap_area(start, end, size, - sizeof(struct early_res)); - } - if (mem == -1ULL) - panic("can not find more space for early_res array"); - - new = __va(mem); - /* save the first one for own */ - new[0].start = mem; - new[0].end = mem + size; - new[0].overlap_ok = 0; - /* copy old to new */ - if (early_res == early_res_x) { - memcpy(&new[1], &early_res[0], - sizeof(struct early_res) * max_early_res); - memset(&new[max_early_res+1], 0, - sizeof(struct early_res) * (max_early_res - 1)); - early_res_count++; - } else { - memcpy(&new[1], &early_res[1], - sizeof(struct early_res) * (max_early_res - 1)); - memset(&new[max_early_res], 0, - sizeof(struct early_res) * max_early_res); - } - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = new; - max_early_res *= 2; - printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", - max_early_res, mem, mem + size - 1); -} - -/* - * Most early reservations come here. - * - * We first have drop_overlaps_that_are_ok() drop any pre-existing - * 'overlap_ok' ranges, so that we can then reserve this memory - * range without risk of panic'ing on an overlapping overlap_ok - * early reservation. - */ -void __init reserve_early(u64 start, u64 end, char *name) -{ - if (start >= end) - return; - - __check_and_double_early_res(start, end); - - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 0); -} - -void __init reserve_early_without_check(u64 start, u64 end, char *name) -{ - struct early_res *r; - - if (start >= end) - return; - - __check_and_double_early_res(start, end); - - r = &early_res[early_res_count]; - - r->start = start; - r->end = end; - r->overlap_ok = 0; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -void __init free_early(u64 start, u64 end) -{ - struct early_res *r; - int i; - - kmemleak_free_part(__va(start), end - start); - - i = find_overlapped_early(start, end); - r = &early_res[i]; - if (i >= max_early_res || r->end != end || r->start != start) - panic("free_early on not reserved area: %llx-%llx!", - start, end - 1); - - drop_range(i); -} - -void __init free_early_partial(u64 start, u64 end) -{ - struct early_res *r; - int i; - - kmemleak_free_part(__va(start), end - start); - - if (start == end) - return; - - if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end)) - return; - -try_next: - i = find_overlapped_early(start, end); - if (i >= max_early_res) - return; - - r = &early_res[i]; - /* hole ? */ - if (r->end >= end && r->start <= start) { - drop_range_partial(i, start, end); - return; - } - - drop_range_partial(i, start, end); - goto try_next; -} - -#ifdef CONFIG_NO_BOOTMEM -static void __init subtract_early_res(struct range *range, int az) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - -#define DEBUG_PRINT_EARLY_RES 1 - -#if DEBUG_PRINT_EARLY_RES - printk(KERN_INFO "Subtract (%d early reservations)\n", count); -#endif - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; -#if DEBUG_PRINT_EARLY_RES - printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, - r->start, r->end, r->name); -#endif - final_start = PFN_DOWN(r->start); - final_end = PFN_UP(r->end); - if (final_start >= final_end) - continue; - subtract_range(range, az, final_start, final_end); - } - -} - -int __init get_free_all_memory_range(struct range **rangep, int nodeid) -{ - int i, count; - u64 start = 0, end; - u64 size; - u64 mem; - struct range *range; - int nr_range; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - count *= 2; - - size = sizeof(struct range) * count; - end = get_max_mapped(); -#ifdef MAX_DMA32_PFN - if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) - start = MAX_DMA32_PFN << PAGE_SHIFT; -#endif - mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); - if (mem == -1ULL) - panic("can not find more space for range free"); - - range = __va(mem); - /* use early_node_map[] and early_res to get range array at first */ - memset(range, 0, size); - nr_range = 0; - - /* need to go over early_node_map to find out good range for node */ - nr_range = add_from_early_node_map(range, count, nr_range, nodeid); -#ifdef CONFIG_X86_32 - subtract_range(range, count, max_low_pfn, -1ULL); -#endif - subtract_early_res(range, count); - nr_range = clean_sort_range(range, count); - - /* need to clear it ? */ - if (nodeid == MAX_NUMNODES) { - memset(&early_res[0], 0, - sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - } - - *rangep = range; - return nr_range; -} -#else -void __init early_res_to_bootmem(u64 start, u64 end) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - - printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count - idx, max_early_res, start, end); - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; - printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, - r->start, r->end, r->name); - final_start = max(start, r->start); - final_end = min(end, r->end); - if (final_start >= final_end) { - printk(KERN_CONT "\n"); - continue; - } - printk(KERN_CONT " ==> [%010llx - %010llx]\n", - final_start, final_end); - reserve_bootmem_generic(final_start, final_end - final_start, - BOOTMEM_DEFAULT); - } - /* clear them */ - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - early_res_count = 0; -} -#endif - -/* Check for already reserved areas */ -static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) -{ - int i; - u64 addr = *addrp; - int changed = 0; - struct early_res *r; -again: - i = find_overlapped_early(addr, addr + size); - r = &early_res[i]; - if (i < max_early_res && r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; - } - return changed; -} - -/* Check for already reserved areas */ -static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) -{ - int i; - u64 addr = *addrp, last; - u64 size = *sizep; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < max_early_res && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last > r->start && addr < r->start) { - size = r->start - addr; - changed = 1; - goto again; - } - if (last > r->end && addr < r->end) { - addr = round_up(r->end, align); - size = last - addr; - changed = 1; - goto again; - } - if (last <= r->end && addr >= r->start) { - (*sizep)++; - return 0; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find a free area with specified alignment in a specific range. - * only with the area.between start to end is active range from early_node_map - * so they are good as RAM - */ -u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - goto out; - if (last > end) - goto out; - - return addr; - -out: - return -1ULL; -} - -u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, - u64 *sizep, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - goto out; - - return addr; - -out: - return -1ULL; -} diff --git a/kernel/exit.c b/kernel/exit.c index 03120229db28..e2bdf37f9fde 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_EVENTS - WARN_ON_ONCE(tsk->perf_event_ctxp); -#endif + perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } diff --git a/kernel/futex.c b/kernel/futex.c index 6a3a5fa1526d..a118bf160e0b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -91,6 +91,7 @@ struct futex_pi_state { /** * struct futex_q - The hashed futex queue entry, one per waiting task + * @list: priority-sorted list of tasks waiting on this futex * @task: the task waiting on the futex * @lock_ptr: the hash bucket lock * @key: the key the futex is hashed on @@ -104,7 +105,7 @@ struct futex_pi_state { * * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. - * The order of wakup is always to make the first condition true, then + * The order of wakeup is always to make the first condition true, then * the second. * * PI futexes are typically woken before they are removed from the hash list via @@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key) * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * - * We have no generic implementation of a non destructive write to the + * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. @@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, */ pi_state = this->pi_state; /* - * Userspace might have messed up non PI and PI futexes + * Userspace might have messed up non-PI and PI futexes */ if (unlikely(!pi_state)) return -EINVAL; @@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q) /* * We set q->lock_ptr = NULL _before_ we wake up the task. If - * a non futex wake up happens on another CPU then the task - * might exit and p would dereference a non existing task + * a non-futex wake up happens on another CPU then the task + * might exit and p would dereference a non-existing task * struct. Prevent this by holding a reference on p across the * wake up. */ @@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 - * uaddr1: source futex user address - * uaddr2: target futex user address - * nr_wake: number of waiters to wake (must be 1 for requeue_pi) - * nr_requeue: number of waiters to requeue (0-INT_MAX) - * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * @uaddr1: source futex user address + * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @uaddr2: target futex user address + * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * @nr_requeue: number of waiters to requeue (0-INT_MAX) + * @cmpval: @uaddr1 expected value (or %NULL) + * @requeue_pi: if we are attempting to requeue from a non-pi futex to a * pi futex (pi to pi requeue is not supported) * * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire @@ -1360,10 +1363,10 @@ out: /* The key must be already stored in q->key. */ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) + __acquires(&hb->lock) { struct futex_hash_bucket *hb; - get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) static inline void queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) { spin_unlock(&hb->lock); - drop_futex_key_refs(&q->key); } /** @@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) * an example). */ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) { int prio; @@ -1471,6 +1475,7 @@ retry: * and dropped here. */ static void unqueue_me_pi(struct futex_q *q) + __releases(q->lock_ptr) { WARN_ON(plist_node_empty(&q->list)); plist_del(&q->list, &q->list.plist); @@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q) q->pi_state = NULL; spin_unlock(q->lock_ptr); - - drop_futex_key_refs(&q->key); } /* @@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared, } retry: - /* Prepare to wait on uaddr. */ + /* + * Prepare to wait on uaddr. On success, holds hb lock and increments + * q.key refs. + */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) goto out; @@ -1822,28 +1828,27 @@ retry: /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; + /* unqueue_me() drops q.key ref */ if (!unqueue_me(&q)) - goto out_put_key; + goto out; ret = -ETIMEDOUT; if (to && !to->task) - goto out_put_key; + goto out; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. */ - if (!signal_pending(current)) { - put_futex_key(fshared, &q.key); + if (!signal_pending(current)) goto retry; - } ret = -ERESTARTSYS; if (!abs_time) - goto out_put_key; + goto out; restart = ¤t_thread_info()->restart_block; restart->fn = futex_wait_restart; - restart->futex.uaddr = (u32 *)uaddr; + restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; @@ -1856,8 +1861,6 @@ retry: ret = -ERESTART_RESTARTBLOCK; -out_put_key: - put_futex_key(fshared, &q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -1869,7 +1872,7 @@ out: static long futex_wait_restart(struct restart_block *restart) { - u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; + u32 __user *uaddr = restart->futex.uaddr; int fshared = 0; ktime_t t, *tp = NULL; @@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, q.rt_waiter = &rt_waiter; q.requeue_pi_key = &key2; - /* Prepare to wait on uaddr. */ + /* + * Prepare to wait on uaddr. On success, increments q.key (key1) ref + * count. + */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) goto out_key2; @@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * In order for us to be here, we know our q.key == key2, and since * we took the hb->lock above, we also know that futex_requeue() has * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquition by the requeue code. + * race with the atomic proxy lock acquisition by the requeue code. The + * futex_requeue dropped our key1 reference and incremented our key2 + * reference count. */ /* Check if the requeue code acquired the second futex for us. */ @@ -2458,7 +2466,7 @@ retry: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, - int *pi) + unsigned int *pi) { unsigned long uentry; @@ -2647,7 +2655,7 @@ static int __init futex_init(void) * of the complex code paths. Also we want to prevent * registration of robust lists in that case. NULL is * guaranteed to fault and we get -EFAULT on functional - * implementation, the non functional ones will return + * implementation, the non-functional ones will return * -ENOSYS. */ curval = cmpxchg_futex_value_locked(NULL, 0, 0); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d49afb2395e5..06da4dfc339b 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -19,7 +19,7 @@ */ static inline int fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t __user *head, int *pi) + compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c642d51aac2..53ead174da2f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - __debug_show_held_locks(t); + debug_show_held_locks(t); touch_nmi_watchdog(); @@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * periodically exit the critical section and enter a new one. * * For preemptible RCU it is sufficient to call rcu_read_unlock in order - * exit the grace period. For classic RCU, a reschedule is required. + * to exit the grace period. For classic RCU, a reschedule is required. */ static void rcu_lock_break(struct task_struct *g, struct task_struct *t) { diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index c7c2aed9e2dc..2c9120f0afca 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) { - struct perf_event_context *ctx = bp->ctx; + struct task_struct *tsk = bp->hw.bp_target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->ctx == ctx && find_slot_idx(iter) == type) + if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) count += hw_breakpoint_weight(iter); } @@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, enum bp_type_idx type) { int cpu = bp->cpu; - struct task_struct *tsk = bp->ctx->task; + struct task_struct *tsk = bp->hw.bp_target; if (cpu >= 0) { slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); @@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { int cpu = bp->cpu; - struct task_struct *tsk = bp->ctx->task; + struct task_struct *tsk = bp->hw.bp_target; /* Pinned counter cpu profiling */ if (!tsk) { @@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, struct task_struct *tsk) { - return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), - triggered); + return perf_event_create_kernel_counter(attr, -1, tsk, triggered); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); @@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, get_online_cpus(); for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); - bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); + bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); *pevent = bp; @@ -566,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { .priority = 0x7fffffff }; +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static int hw_breakpoint_event_init(struct perf_event *bp) +{ + int err; + + if (bp->attr.type != PERF_TYPE_BREAKPOINT) + return -ENOENT; + + err = register_perf_hw_breakpoint(bp); + if (err) + return err; + + bp->destroy = bp_perf_event_destroy; + + return 0; +} + +static int hw_breakpoint_add(struct perf_event *bp, int flags) +{ + if (!(flags & PERF_EF_START)) + bp->hw.state = PERF_HES_STOPPED; + + return arch_install_hw_breakpoint(bp); +} + +static void hw_breakpoint_del(struct perf_event *bp, int flags) +{ + arch_uninstall_hw_breakpoint(bp); +} + +static void hw_breakpoint_start(struct perf_event *bp, int flags) +{ + bp->hw.state = 0; +} + +static void hw_breakpoint_stop(struct perf_event *bp, int flags) +{ + bp->hw.state = PERF_HES_STOPPED; +} + +static struct pmu perf_breakpoint = { + .task_ctx_nr = perf_sw_context, /* could eventually get its own */ + + .event_init = hw_breakpoint_event_init, + .add = hw_breakpoint_add, + .del = hw_breakpoint_del, + .start = hw_breakpoint_start, + .stop = hw_breakpoint_stop, + .read = hw_breakpoint_pmu_read, +}; + static int __init init_hw_breakpoint(void) { unsigned int **task_bp_pinned; @@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void) constraints_initialized = 1; + perf_pmu_register(&perf_breakpoint); + return register_die_notifier(&hw_breakpoint_exceptions_nb); err_alloc: @@ -602,8 +658,3 @@ static int __init init_hw_breakpoint(void) core_initcall(init_hw_breakpoint); -struct pmu perf_ops_bp = { - .enable = arch_install_hw_breakpoint, - .disable = arch_uninstall_hw_breakpoint, - .read = hw_breakpoint_pmu_read, -}; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig new file mode 100644 index 000000000000..31d766bf5d2e --- /dev/null +++ b/kernel/irq/Kconfig @@ -0,0 +1,53 @@ +config HAVE_GENERIC_HARDIRQS + def_bool n + +if HAVE_GENERIC_HARDIRQS +menu "IRQ subsystem" +# +# Interrupt subsystem related configuration options +# +config GENERIC_HARDIRQS + def_bool y + +config GENERIC_HARDIRQS_NO__DO_IRQ + def_bool y + +# Select this to disable the deprecated stuff +config GENERIC_HARDIRQS_NO_DEPRECATED + def_bool n + +# Options selectable by the architecture code +config HAVE_SPARSE_IRQ + def_bool n + +config GENERIC_IRQ_PROBE + def_bool n + +config GENERIC_PENDING_IRQ + def_bool n + +config AUTO_IRQ_AFFINITY + def_bool n + +config IRQ_PER_CPU + def_bool n + +config HARDIRQS_SW_RESEND + def_bool n + +config SPARSE_IRQ + bool "Support sparse irq numbering" + depends on HAVE_SPARSE_IRQ + ---help--- + + Sparse irq numbering is useful for distro kernels that want + to define a high CONFIG_NR_CPUS value but still want to have + low kernel memory footprint on smaller machines. + + ( Sparse irqs can also be beneficial on NUMA boxes, as they spread + out the interrupt descriptors in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. + +endmenu +endif diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 7d047808419d..54329cd7b3ee 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,7 +1,6 @@ -obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o +obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 2295a31ef110..505798f86c36 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -57,9 +57,10 @@ unsigned long probe_irq_on(void) * Some chips need to know about probing in * progress: */ - if (desc->chip->set_type) - desc->chip->set_type(i, IRQ_TYPE_PROBE); - desc->chip->startup(i); + if (desc->irq_data.chip->irq_set_type) + desc->irq_data.chip->irq_set_type(&desc->irq_data, + IRQ_TYPE_PROBE); + desc->irq_data.chip->irq_startup(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } @@ -76,7 +77,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->chip->startup(i)) + if (desc->irq_data.chip->irq_startup(&desc->irq_data)) desc->status |= IRQ_PENDING; } raw_spin_unlock_irq(&desc->lock); @@ -98,7 +99,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. */ if (!(status & IRQ_WAITING)) { desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } else if (i < 32) mask |= 1 << i; @@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } @@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val) nr_of_irqs++; } desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b7091d5ca2f8..baa5c4acad83 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -18,108 +18,6 @@ #include "internals.h" -static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) -{ - struct irq_desc *desc; - unsigned long flags; - - desc = irq_to_desc(irq); - if (!desc) { - WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); - return; - } - - /* Ensure we don't have left over values from a previous use of this irq */ - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status = IRQ_DISABLED; - desc->chip = &no_irq_chip; - desc->handle_irq = handle_bad_irq; - desc->depth = 1; - desc->msi_desc = NULL; - desc->handler_data = NULL; - if (!keep_chip_data) - desc->chip_data = NULL; - desc->action = NULL; - desc->irq_count = 0; - desc->irqs_unhandled = 0; -#ifdef CONFIG_SMP - cpumask_setall(desc->affinity); -#ifdef CONFIG_GENERIC_PENDING_IRQ - cpumask_clear(desc->pending_mask); -#endif -#endif - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -/** - * dynamic_irq_init - initialize a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_init(unsigned int irq) -{ - dynamic_irq_init_x(irq, false); -} - -/** - * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq - * @irq: irq number to initialize - * - * does not set irq_to_desc(irq)->chip_data to NULL - */ -void dynamic_irq_init_keep_chip_data(unsigned int irq) -{ - dynamic_irq_init_x(irq, true); -} - -static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) { - WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); - return; - } - - raw_spin_lock_irqsave(&desc->lock, flags); - if (desc->action) { - raw_spin_unlock_irqrestore(&desc->lock, flags); - WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", - irq); - return; - } - desc->msi_desc = NULL; - desc->handler_data = NULL; - if (!keep_chip_data) - desc->chip_data = NULL; - desc->handle_irq = handle_bad_irq; - desc->chip = &no_irq_chip; - desc->name = NULL; - clear_kstat_irqs(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -/** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_cleanup(unsigned int irq) -{ - dynamic_irq_cleanup_x(irq, false); -} - -/** - * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq - * @irq: irq number to initialize - * - * does not set irq_to_desc(irq)->chip_data to NULL - */ -void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) -{ - dynamic_irq_cleanup_x(irq, true); -} - - /** * set_irq_chip - set the irq chip for an irq * @irq: irq number @@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) raw_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); - desc->chip = chip; + desc->irq_data.chip = chip; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data) } raw_spin_lock_irqsave(&desc->lock, flags); - desc->handler_data = data; + desc->irq_data.handler_data = data; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) } raw_spin_lock_irqsave(&desc->lock, flags); - desc->msi_desc = entry; + desc->irq_data.msi_desc = entry; if (entry) entry->irq = irq; raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data) return -EINVAL; } - if (!desc->chip) { + if (!desc->irq_data.chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; } raw_spin_lock_irqsave(&desc->lock, flags); - desc->chip_data = data; + desc->irq_data.chip_data = data; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(set_irq_chip_data); +struct irq_data *irq_get_irq_data(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return desc ? &desc->irq_data : NULL; +} +EXPORT_SYMBOL_GPL(irq_get_irq_data); + /** * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq * @@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread); /* * default enable function */ -static void default_enable(unsigned int irq) +static void default_enable(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); - desc->chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); desc->status &= ~IRQ_MASKED; } /* * default disable function */ -static void default_disable(unsigned int irq) +static void default_disable(struct irq_data *data) { } /* * default startup function */ -static unsigned int default_startup(unsigned int irq) +static unsigned int default_startup(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); - desc->chip->enable(irq); + desc->irq_data.chip->irq_enable(data); return 0; } /* * default shutdown function */ -static void default_shutdown(unsigned int irq) +static void default_shutdown(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); - desc->chip->mask(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); desc->status |= IRQ_MASKED; } +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED +/* Temporary migration helpers */ +static void compat_irq_mask(struct irq_data *data) +{ + data->chip->mask(data->irq); +} + +static void compat_irq_unmask(struct irq_data *data) +{ + data->chip->unmask(data->irq); +} + +static void compat_irq_ack(struct irq_data *data) +{ + data->chip->ack(data->irq); +} + +static void compat_irq_mask_ack(struct irq_data *data) +{ + data->chip->mask_ack(data->irq); +} + +static void compat_irq_eoi(struct irq_data *data) +{ + data->chip->eoi(data->irq); +} + +static void compat_irq_enable(struct irq_data *data) +{ + data->chip->enable(data->irq); +} + +static void compat_irq_disable(struct irq_data *data) +{ + data->chip->disable(data->irq); +} + +static void compat_irq_shutdown(struct irq_data *data) +{ + data->chip->shutdown(data->irq); +} + +static unsigned int compat_irq_startup(struct irq_data *data) +{ + return data->chip->startup(data->irq); +} + +static int compat_irq_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + return data->chip->set_affinity(data->irq, dest); +} + +static int compat_irq_set_type(struct irq_data *data, unsigned int type) +{ + return data->chip->set_type(data->irq, type); +} + +static int compat_irq_set_wake(struct irq_data *data, unsigned int on) +{ + return data->chip->set_wake(data->irq, on); +} + +static int compat_irq_retrigger(struct irq_data *data) +{ + return data->chip->retrigger(data->irq); +} + +static void compat_bus_lock(struct irq_data *data) +{ + data->chip->bus_lock(data->irq); +} + +static void compat_bus_sync_unlock(struct irq_data *data) +{ + data->chip->bus_sync_unlock(data->irq); +} +#endif + /* * Fixup enable/disable function pointers */ void irq_chip_set_defaults(struct irq_chip *chip) { - if (!chip->enable) - chip->enable = default_enable; - if (!chip->disable) - chip->disable = default_disable; - if (!chip->startup) - chip->startup = default_startup; +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED /* - * We use chip->disable, when the user provided its own. When - * we have default_disable set for chip->disable, then we need + * Compat fixup functions need to be before we set the + * defaults for enable/disable/startup/shutdown + */ + if (chip->enable) + chip->irq_enable = compat_irq_enable; + if (chip->disable) + chip->irq_disable = compat_irq_disable; + if (chip->shutdown) + chip->irq_shutdown = compat_irq_shutdown; + if (chip->startup) + chip->irq_startup = compat_irq_startup; +#endif + /* + * The real defaults + */ + if (!chip->irq_enable) + chip->irq_enable = default_enable; + if (!chip->irq_disable) + chip->irq_disable = default_disable; + if (!chip->irq_startup) + chip->irq_startup = default_startup; + /* + * We use chip->irq_disable, when the user provided its own. When + * we have default_disable set for chip->irq_disable, then we need * to use default_shutdown, otherwise the irq line is not * disabled on free_irq(): */ - if (!chip->shutdown) - chip->shutdown = chip->disable != default_disable ? - chip->disable : default_shutdown; - if (!chip->name) - chip->name = chip->typename; + if (!chip->irq_shutdown) + chip->irq_shutdown = chip->irq_disable != default_disable ? + chip->irq_disable : default_shutdown; + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED if (!chip->end) chip->end = dummy_irq_chip.end; + + /* + * Now fix up the remaining compat handlers + */ + if (chip->bus_lock) + chip->irq_bus_lock = compat_bus_lock; + if (chip->bus_sync_unlock) + chip->irq_bus_sync_unlock = compat_bus_sync_unlock; + if (chip->mask) + chip->irq_mask = compat_irq_mask; + if (chip->unmask) + chip->irq_unmask = compat_irq_unmask; + if (chip->ack) + chip->irq_ack = compat_irq_ack; + if (chip->mask_ack) + chip->irq_mask_ack = compat_irq_mask_ack; + if (chip->eoi) + chip->irq_eoi = compat_irq_eoi; + if (chip->set_affinity) + chip->irq_set_affinity = compat_irq_set_affinity; + if (chip->set_type) + chip->irq_set_type = compat_irq_set_type; + if (chip->set_wake) + chip->irq_set_wake = compat_irq_set_wake; + if (chip->retrigger) + chip->irq_retrigger = compat_irq_retrigger; +#endif } -static inline void mask_ack_irq(struct irq_desc *desc, int irq) +static inline void mask_ack_irq(struct irq_desc *desc) { - if (desc->chip->mask_ack) - desc->chip->mask_ack(irq); + if (desc->irq_data.chip->irq_mask_ack) + desc->irq_data.chip->irq_mask_ack(&desc->irq_data); else { - desc->chip->mask(irq); - if (desc->chip->ack) - desc->chip->ack(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); + if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(&desc->irq_data); } desc->status |= IRQ_MASKED; } -static inline void mask_irq(struct irq_desc *desc, int irq) +static inline void mask_irq(struct irq_desc *desc) { - if (desc->chip->mask) { - desc->chip->mask(irq); + if (desc->irq_data.chip->irq_mask) { + desc->irq_data.chip->irq_mask(&desc->irq_data); desc->status |= IRQ_MASKED; } } -static inline void unmask_irq(struct irq_desc *desc, int irq) +static inline void unmask_irq(struct irq_desc *desc) { - if (desc->chip->unmask) { - desc->chip->unmask(irq); + if (desc->irq_data.chip->irq_unmask) { + desc->irq_data.chip->irq_unmask(&desc->irq_data); desc->status &= ~IRQ_MASKED; } } @@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; raw_spin_lock(&desc->lock); - mask_ack_irq(desc, irq); + mask_ack_irq(desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) - unmask_irq(desc, irq); + unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); } @@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; - mask_irq(desc, irq); + mask_irq(desc); goto out; } @@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out: - desc->chip->eoi(irq); + desc->irq_data.chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } @@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); - mask_ack_irq(desc, irq); + mask_ack_irq(desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ - if (desc->chip->ack) - desc->chip->ack(irq); + desc->irq_data.chip->irq_ack(&desc->irq_data); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; if (unlikely(!action)) { - mask_irq(desc, irq); + mask_irq(desc); goto out_unlock; } @@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_MASKED))) { - unmask_irq(desc, irq); + unmask_irq(desc); } desc->status &= ~IRQ_PENDING; @@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(&desc->irq_data); action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) - desc->chip->eoi(irq); + if (desc->irq_data.chip->irq_eoi) + desc->irq_data.chip->irq_eoi(&desc->irq_data); } void @@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (!handle) handle = handle_bad_irq; - else if (desc->chip == &no_irq_chip) { + else if (desc->irq_data.chip == &no_irq_chip) { printk(KERN_WARNING "Trying to install %sinterrupt handler " "for IRQ%d\n", is_chained ? "chained " : "", irq); /* @@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, * prevent us to setup the interrupt at all. Switch it to * dummy_irq_chip for easy transition. */ - desc->chip = &dummy_irq_chip; + desc->irq_data.chip = &dummy_irq_chip; } - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) - mask_ack_irq(desc, irq); + if (desc->irq_data.chip != &no_irq_chip) + mask_ack_irq(desc); desc->status |= IRQ_DISABLED; desc->depth = 1; } @@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; desc->depth = 0; - desc->chip->startup(irq); + desc->irq_data.chip->irq_startup(&desc->irq_data); } raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL_GPL(__set_irq_handler); @@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, __set_irq_handler(irq, handle, 0, name); } -void set_irq_noprobe(unsigned int irq) +void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (!desc) { - printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); + if (!desc) return; - } - - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status |= IRQ_NOPROBE; - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -void set_irq_probe(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - if (!desc) { - printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); - return; - } + /* Sanitize flags */ + set &= IRQF_MODIFY_MASK; + clr &= IRQF_MODIFY_MASK; raw_spin_lock_irqsave(&desc->lock, flags); - desc->status &= ~IRQ_NOPROBE; + desc->status &= ~clr; + desc->status |= set; raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c new file mode 100644 index 000000000000..20dc5474947e --- /dev/null +++ b/kernel/irq/dummychip.c @@ -0,0 +1,68 @@ +/* + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the dummy interrupt chip implementation + */ +#include <linux/interrupt.h> +#include <linux/irq.h> + +#include "internals.h" + +/* + * What should we do if we get a hw irq event on an illegal vector? + * Each architecture has to answer this themself. + */ +static void ack_bad(struct irq_data *data) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + print_irq_desc(data->irq, desc); + ack_bad_irq(data->irq); +} + +/* + * NOP functions + */ +static void noop(struct irq_data *data) { } + +static unsigned int noop_ret(struct irq_data *data) +{ + return 0; +} + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED +static void compat_noop(unsigned int irq) { } +#define END_INIT .end = compat_noop +#else +#define END_INIT +#endif + +/* + * Generic no controller implementation + */ +struct irq_chip no_irq_chip = { + .name = "none", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = ack_bad, + END_INIT +}; + +/* + * Generic dummy implementation which can be used for + * real dumb interrupt sources + */ +struct irq_chip dummy_irq_chip = { + .name = "dummy", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = noop, + .irq_mask = noop, + .irq_unmask = noop, + END_INIT +}; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 27e5c6911223..e2347eb63306 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,24 +11,15 @@ */ #include <linux/irq.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/module.h> #include <linux/random.h> +#include <linux/sched.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> -#include <linux/rculist.h> -#include <linux/hash.h> -#include <linux/radix-tree.h> + #include <trace/events/irq.h> #include "internals.h" -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -struct lock_class_key irq_desc_lock_class; - /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) ack_bad_irq(irq); } -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -static void __init init_irq_default_affinity(void) -{ - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); -} -#else -static void __init init_irq_default_affinity(void) -{ -} -#endif - -/* - * Linux has a controller-independent interrupt architecture. - * Every controller has a 'controller-template', that is used - * by the main code to do the right thing. Each driver-visible - * interrupt source is transparently wired to the appropriate - * controller. Thus drivers need not be aware of the - * interrupt-controller. - * - * The code is designed to be easily extended with new/different - * interrupt controllers, without having to do assembly magic or - * having to touch the generic code. - * - * Controller mappings for all interrupt sources: - */ -int nr_irqs = NR_IRQS; -EXPORT_SYMBOL_GPL(nr_irqs); - -#ifdef CONFIG_SPARSE_IRQ - -static struct irq_desc irq_desc_init = { - .irq = -1, - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -}; - -void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) -{ - void *ptr; - - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), - GFP_ATOMIC, node); - - /* - * don't overwite if can not get new one - * init_copy_kstat_irqs() could still use old one - */ - if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); - desc->kstat_irqs = ptr; - } -} - -static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) -{ - memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - - raw_spin_lock_init(&desc->lock); - desc->irq = irq; -#ifdef CONFIG_SMP - desc->node = node; -#endif - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, node, nr_cpu_ids); - if (!desc->kstat_irqs) { - printk(KERN_ERR "can not alloc kstat_irqs\n"); - BUG_ON(1); - } - if (!alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); - BUG_ON(1); - } - init_desc_masks(desc); - arch_init_chip_data(desc, node); -} - -/* - * Protect the sparse_irqs: - */ -DEFINE_RAW_SPINLOCK(sparse_irq_lock); - -static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); - -static void set_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - radix_tree_insert(&irq_desc_tree, irq, desc); -} - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - return radix_tree_lookup(&irq_desc_tree, irq); -} - -void replace_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - void **ptr; - - ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); - if (ptr) - radix_tree_replace_slot(ptr, desc); -} - -static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { - [0 ... NR_IRQS_LEGACY-1] = { - .irq = -1, - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), - } -}; - -static unsigned int *kstat_irqs_legacy; - -int __init early_irq_init(void) -{ - struct irq_desc *desc; - int legacy_count; - int node; - int i; - - init_irq_default_affinity(); - - /* initialize nr_irqs based on nr_cpu_ids */ - arch_probe_nr_irqs(); - printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); - - desc = irq_desc_legacy; - legacy_count = ARRAY_SIZE(irq_desc_legacy); - node = first_online_node; - - /* allocate based on nr_cpu_ids */ - kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * - sizeof(int), GFP_NOWAIT, node); - - for (i = 0; i < legacy_count; i++) { - desc[i].irq = i; -#ifdef CONFIG_SMP - desc[i].node = node; -#endif - desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; - lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - alloc_desc_masks(&desc[i], node, true); - init_desc_masks(&desc[i]); - set_irq_desc(i, &desc[i]); - } - - return arch_early_irq_init(); -} - -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= nr_irqs) { - WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", - irq, nr_irqs); - return NULL; - } - - desc = irq_to_desc(irq); - if (desc) - return desc; - - raw_spin_lock_irqsave(&sparse_irq_lock, flags); - - /* We have to check it to avoid races with another CPU */ - desc = irq_to_desc(irq); - if (desc) - goto out_unlock; - - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - - printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); - if (!desc) { - printk(KERN_ERR "can not alloc irq_desc\n"); - BUG_ON(1); - } - init_one_irq_desc(irq, desc, node); - - set_irq_desc(irq, desc); - -out_unlock: - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} - -#else /* !CONFIG_SPARSE_IRQ */ - -struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { - [0 ... NR_IRQS-1] = { - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), - } -}; - -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; -int __init early_irq_init(void) -{ - struct irq_desc *desc; - int count; - int i; - - init_irq_default_affinity(); - - printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); - - desc = irq_desc; - count = ARRAY_SIZE(irq_desc); - - for (i = 0; i < count; i++) { - desc[i].irq = i; - alloc_desc_masks(&desc[i], 0, true); - init_desc_masks(&desc[i]); - desc[i].kstat_irqs = kstat_irqs_all[i]; - } - return arch_early_irq_init(); -} - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - return (irq < NR_IRQS) ? irq_desc + irq : NULL; -} - -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) -{ - return irq_to_desc(irq); -} -#endif /* !CONFIG_SPARSE_IRQ */ - -void clear_kstat_irqs(struct irq_desc *desc) -{ - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); -} - -/* - * What should we do if we get a hw irq event on an illegal vector? - * Each architecture has to answer this themself. - */ -static void ack_bad(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - print_irq_desc(irq, desc); - ack_bad_irq(irq); -} - -/* - * NOP functions - */ -static void noop(unsigned int irq) -{ -} - -static unsigned int noop_ret(unsigned int irq) -{ - return 0; -} - -/* - * Generic no controller implementation - */ -struct irq_chip no_irq_chip = { - .name = "none", - .startup = noop_ret, - .shutdown = noop, - .enable = noop, - .disable = noop, - .ack = ack_bad, - .end = noop, -}; - -/* - * Generic dummy implementation which can be used for - * real dumb interrupt sources - */ -struct irq_chip dummy_irq_chip = { - .name = "dummy", - .startup = noop_ret, - .shutdown = noop, - .enable = noop, - .disable = noop, - .ack = noop, - .mask = noop, - .unmask = noop, - .end = noop, -}; - /* * Special, empty irq handler: */ @@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->ack) + desc->irq_data.chip->ack(irq); if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); } - desc->chip->end(irq); + desc->irq_data.chip->end(irq); return 1; } raw_spin_lock(&desc->lock); - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->ack) + desc->irq_data.chip->ack(irq); /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested @@ -530,27 +223,9 @@ out: * The ->end() handler has to deal with interrupts which got * disabled while the handler was running. */ - desc->chip->end(irq); + desc->irq_data.chip->end(irq); raw_spin_unlock(&desc->lock); return 1; } #endif - -void early_init_irq_lock_class(void) -{ - struct irq_desc *desc; - int i; - - for_each_irq_desc(i, desc) { - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - } -} - -unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) -{ - struct irq_desc *desc = irq_to_desc(irq); - return desc ? desc->kstat_irqs[cpu] : 0; -} -EXPORT_SYMBOL(kstat_irqs_cpu); - diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c63f3bc88f0b..4571ae7e085a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,9 +1,12 @@ /* * IRQ subsystem internal functions and variables: */ +#include <linux/irqdesc.h> extern int noirqdebug; +#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) + /* Set default functions for irq_chip structures: */ extern void irq_chip_set_defaults(struct irq_chip *chip); @@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); -extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -extern void clear_kstat_irqs(struct irq_desc *desc); -extern raw_spinlock_t sparse_irq_lock; -#ifdef CONFIG_SPARSE_IRQ -void replace_irq_desc(unsigned int irq, struct irq_desc *desc); -#endif +/* Resending of interrupts :*/ +void check_irq_resend(struct irq_desc *desc, unsigned int irq); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); +extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); #else static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } +static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void register_handler_proc(unsigned int irq, struct irqaction *action) { } static inline void unregister_handler_proc(unsigned int irq, @@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED +static inline void irq_end(unsigned int irq, struct irq_desc *desc) +{ + if (desc->irq_data.chip && desc->irq_data.chip->end) + desc->irq_data.chip->end(irq); +} +#else +static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } +#endif + /* Inline functions for support of irq chips on slow busses */ -static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) +static inline void chip_bus_lock(struct irq_desc *desc) { - if (unlikely(desc->chip->bus_lock)) - desc->chip->bus_lock(irq); + if (unlikely(desc->irq_data.chip->irq_bus_lock)) + desc->irq_data.chip->irq_bus_lock(&desc->irq_data); } -static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) +static inline void chip_bus_sync_unlock(struct irq_desc *desc) { - if (unlikely(desc->chip->bus_sync_unlock)) - desc->chip->bus_sync_unlock(irq); + if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock)) + desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } /* @@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); printk("->handle_irq(): %p, ", desc->handle_irq); print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->chip(): %p, ", desc->chip); - print_symbol("%s\n", (unsigned long)desc->chip); + printk("->irq_data.chip(): %p, ", desc->irq_data.chip); + print_symbol("%s\n", (unsigned long)desc->irq_data.chip); printk("->action(): %p\n", desc->action); if (desc->action) { printk("->action->handler(): %p, ", desc->action->handler); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c new file mode 100644 index 000000000000..9d917ff72675 --- /dev/null +++ b/kernel/irq/irqdesc.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the interrupt descriptor management code + * + * Detailed information is available in Documentation/DocBook/genericirq + * + */ +#include <linux/irq.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/radix-tree.h> +#include <linux/bitmap.h> + +#include "internals.h" + +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; + +#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +static void __init init_irq_default_affinity(void) +{ + alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpumask_setall(irq_default_affinity); +} +#else +static void __init init_irq_default_affinity(void) +{ +} +#endif + +#ifdef CONFIG_SMP +static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) +{ + if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) + return -ENOMEM; + +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { + free_cpumask_var(desc->irq_data.affinity); + return -ENOMEM; + } +#endif + return 0; +} + +static void desc_smp_init(struct irq_desc *desc, int node) +{ + desc->irq_data.node = node; + cpumask_copy(desc->irq_data.affinity, irq_default_affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif +} + +static inline int desc_node(struct irq_desc *desc) +{ + return desc->irq_data.node; +} + +#else +static inline int +alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } +static inline void desc_smp_init(struct irq_desc *desc, int node) { } +static inline int desc_node(struct irq_desc *desc) { return 0; } +#endif + +static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) +{ + desc->irq_data.irq = irq; + desc->irq_data.chip = &no_irq_chip; + desc->irq_data.chip_data = NULL; + desc->irq_data.handler_data = NULL; + desc->irq_data.msi_desc = NULL; + desc->status = IRQ_DEFAULT_INIT_FLAGS; + desc->handle_irq = handle_bad_irq; + desc->depth = 1; + desc->irq_count = 0; + desc->irqs_unhandled = 0; + desc->name = NULL; + memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + desc_smp_init(desc, node); +} + +int nr_irqs = NR_IRQS; +EXPORT_SYMBOL_GPL(nr_irqs); + +static DEFINE_MUTEX(sparse_irq_lock); +static DECLARE_BITMAP(allocated_irqs, NR_IRQS); + +#ifdef CONFIG_SPARSE_IRQ + +static RADIX_TREE(irq_desc_tree, GFP_KERNEL); + +static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) +{ + radix_tree_insert(&irq_desc_tree, irq, desc); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return radix_tree_lookup(&irq_desc_tree, irq); +} + +static void delete_irq_desc(unsigned int irq) +{ + radix_tree_delete(&irq_desc_tree, irq); +} + +#ifdef CONFIG_SMP +static void free_masks(struct irq_desc *desc) +{ +#ifdef CONFIG_GENERIC_PENDING_IRQ + free_cpumask_var(desc->pending_mask); +#endif + free_cpumask_var(desc->irq_data.affinity); +} +#else +static inline void free_masks(struct irq_desc *desc) { } +#endif + +static struct irq_desc *alloc_desc(int irq, int node) +{ + struct irq_desc *desc; + gfp_t gfp = GFP_KERNEL; + + desc = kzalloc_node(sizeof(*desc), gfp, node); + if (!desc) + return NULL; + /* allocate based on nr_cpu_ids */ + desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), + gfp, node); + if (!desc->kstat_irqs) + goto err_desc; + + if (alloc_masks(desc, gfp, node)) + goto err_kstat; + + raw_spin_lock_init(&desc->lock); + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + + desc_set_defaults(irq, desc, node); + + return desc; + +err_kstat: + kfree(desc->kstat_irqs); +err_desc: + kfree(desc); + return NULL; +} + +static void free_desc(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + unregister_irq_proc(irq, desc); + + mutex_lock(&sparse_irq_lock); + delete_irq_desc(irq); + mutex_unlock(&sparse_irq_lock); + + free_masks(desc); + kfree(desc->kstat_irqs); + kfree(desc); +} + +static int alloc_descs(unsigned int start, unsigned int cnt, int node) +{ + struct irq_desc *desc; + int i; + + for (i = 0; i < cnt; i++) { + desc = alloc_desc(start + i, node); + if (!desc) + goto err; + mutex_lock(&sparse_irq_lock); + irq_insert_desc(start + i, desc); + mutex_unlock(&sparse_irq_lock); + } + return start; + +err: + for (i--; i >= 0; i--) + free_desc(start + i); + + mutex_lock(&sparse_irq_lock); + bitmap_clear(allocated_irqs, start, cnt); + mutex_unlock(&sparse_irq_lock); + return -ENOMEM; +} + +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) +{ + int res = irq_alloc_descs(irq, irq, 1, node); + + if (res == -EEXIST || res == irq) + return irq_to_desc(irq); + return NULL; +} + +int __init early_irq_init(void) +{ + int i, initcnt, node = first_online_node; + struct irq_desc *desc; + + init_irq_default_affinity(); + + /* Let arch update nr_irqs and return the nr of preallocated irqs */ + initcnt = arch_probe_nr_irqs(); + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); + + for (i = 0; i < initcnt; i++) { + desc = alloc_desc(i, node); + set_bit(i, allocated_irqs); + irq_insert_desc(i, desc); + } + return arch_early_irq_init(); +} + +#else /* !CONFIG_SPARSE_IRQ */ + +struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { + [0 ... NR_IRQS-1] = { + .status = IRQ_DEFAULT_INIT_FLAGS, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), + } +}; + +static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; +int __init early_irq_init(void) +{ + int count, i, node = first_online_node; + struct irq_desc *desc; + + init_irq_default_affinity(); + + printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + + desc = irq_desc; + count = ARRAY_SIZE(irq_desc); + + for (i = 0; i < count; i++) { + desc[i].irq_data.irq = i; + desc[i].irq_data.chip = &no_irq_chip; + desc[i].kstat_irqs = kstat_irqs_all[i]; + alloc_masks(desc + i, GFP_KERNEL, node); + desc_smp_init(desc + i, node); + lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + } + return arch_early_irq_init(); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return (irq < NR_IRQS) ? irq_desc + irq : NULL; +} + +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) +{ + return irq_to_desc(irq); +} + +static void free_desc(unsigned int irq) +{ + dynamic_irq_cleanup(irq); +} + +static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) +{ + return start; +} +#endif /* !CONFIG_SPARSE_IRQ */ + +/* Dynamic interrupt handling */ + +/** + * irq_free_descs - free irq descriptors + * @from: Start of descriptor range + * @cnt: Number of consecutive irqs to free + */ +void irq_free_descs(unsigned int from, unsigned int cnt) +{ + int i; + + if (from >= nr_irqs || (from + cnt) > nr_irqs) + return; + + for (i = 0; i < cnt; i++) + free_desc(from + i); + + mutex_lock(&sparse_irq_lock); + bitmap_clear(allocated_irqs, from, cnt); + mutex_unlock(&sparse_irq_lock); +} + +/** + * irq_alloc_descs - allocate and initialize a range of irq descriptors + * @irq: Allocate for specific irq number if irq >= 0 + * @from: Start the search from this irq number + * @cnt: Number of consecutive irqs to allocate. + * @node: Preferred node on which the irq descriptor should be allocated + * + * Returns the first irq number or error code + */ +int __ref +irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) +{ + int start, ret; + + if (!cnt) + return -EINVAL; + + mutex_lock(&sparse_irq_lock); + + start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + ret = -EEXIST; + if (irq >=0 && start != irq) + goto err; + + ret = -ENOMEM; + if (start >= nr_irqs) + goto err; + + bitmap_set(allocated_irqs, start, cnt); + mutex_unlock(&sparse_irq_lock); + return alloc_descs(start, cnt, node); + +err: + mutex_unlock(&sparse_irq_lock); + return ret; +} + +/** + * irq_reserve_irqs - mark irqs allocated + * @from: mark from irq number + * @cnt: number of irqs to mark + * + * Returns 0 on success or an appropriate error code + */ +int irq_reserve_irqs(unsigned int from, unsigned int cnt) +{ + unsigned int start; + int ret = 0; + + if (!cnt || (from + cnt) > nr_irqs) + return -EINVAL; + + mutex_lock(&sparse_irq_lock); + start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + if (start == from) + bitmap_set(allocated_irqs, start, cnt); + else + ret = -EEXIST; + mutex_unlock(&sparse_irq_lock); + return ret; +} + +/** + * irq_get_next_irq - get next allocated irq number + * @offset: where to start the search + * + * Returns next irq number after offset or nr_irqs if none is found. + */ +unsigned int irq_get_next_irq(unsigned int offset) +{ + return find_next_bit(allocated_irqs, nr_irqs, offset); +} + +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc_set_defaults(irq, desc, desc_node(desc)); + raw_spin_unlock_irqrestore(&desc->lock, flags); +} + +unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc = irq_to_desc(irq); + return desc ? desc->kstat_irqs[cpu] : 0; +} diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c3003e9d91a3..644e8d5fa367 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || - !desc->chip->set_affinity) + if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || + !desc->irq_data.chip->irq_set_affinity) return 0; return 1; @@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc) int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip = desc->irq_data.chip; unsigned long flags; - if (!desc->chip->set_affinity) + if (!chip->irq_set_affinity) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT) { - if (!desc->chip->set_affinity(irq, cpumask)) { - cpumask_copy(desc->affinity, cpumask); + if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { + cpumask_copy(desc->irq_data.affinity, cpumask); irq_set_thread_affinity(desc); } } @@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(desc->pending_mask, cpumask); } #else - if (!desc->chip->set_affinity(irq, cpumask)) { - cpumask_copy(desc->affinity, cpumask); + if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { + cpumask_copy(desc->irq_data.affinity, cpumask); irq_set_thread_affinity(desc); } #endif @@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) * one of the targets is online. */ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpumask_any_and(desc->affinity, cpu_online_mask) + if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) < nr_cpu_ids) goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); + cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); set_affinity: - desc->chip->set_affinity(irq, desc->affinity); + desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); return 0; } @@ -223,7 +224,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (!desc->depth++) { desc->status |= IRQ_DISABLED; - desc->chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); } } @@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, false); raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(disable_irq_nosync); @@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) * IRQ line is re-enabled. * * This function may be called from IRQ context only when - * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! + * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { @@ -323,11 +324,11 @@ void enable_irq(unsigned int irq) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(enable_irq); @@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; - if (desc->chip->set_wake) - ret = desc->chip->set_wake(irq, on); + if (desc->irq_data.chip->irq_set_wake) + ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); return ret; } @@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) } int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, - unsigned long flags) + unsigned long flags) { int ret; - struct irq_chip *chip = desc->chip; + struct irq_chip *chip = desc->irq_data.chip; - if (!chip || !chip->set_type) { + if (!chip || !chip->irq_set_type) { /* * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? @@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, } /* caller masked out all except trigger mode flags */ - ret = chip->set_type(irq, flags); + ret = chip->irq_set_type(&desc->irq_data, flags); if (ret) - pr_err("setting trigger mode %d for irq %u failed (%pF)\n", - (int)flags, irq, chip->set_type); + pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + flags, irq, chip->irq_set_type); else { if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) flags |= IRQ_LEVEL; @@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); desc->status |= flags; - if (chip != desc->chip) - irq_chip_set_defaults(desc->chip); + if (chip != desc->irq_data.chip) + irq_chip_set_defaults(desc->irq_data.chip); } return ret; @@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { again: - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); /* @@ -521,17 +522,17 @@ again: */ if (unlikely(desc->status & IRQ_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); cpu_relax(); goto again; } if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; - desc->chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } #ifdef CONFIG_SMP @@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) } raw_spin_lock_irq(&desc->lock); - cpumask_copy(mask, desc->affinity); + cpumask_copy(mask, desc->irq_data.affinity); raw_spin_unlock_irq(&desc->lock); set_cpus_allowed_ptr(current, mask); @@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!desc) return -EINVAL; - if (desc->chip == &no_irq_chip) + if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; /* * Some drivers like serial.c use request_irq() heavily, @@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - irq_chip_set_defaults(desc->chip); + irq_chip_set_defaults(desc->irq_data.chip); init_waitqueue_head(&desc->wait_for_threads); @@ -779,7 +780,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; - desc->chip->startup(irq); + desc->irq_data.chip->irq_startup(&desc->irq_data); } else /* Undo nested disables: */ desc->depth = 1; @@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Currently used only by UML, might disappear one day: */ #ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->chip->release) - desc->chip->release(irq, dev_id); + if (desc->irq_data.chip->release) + desc->irq_data.chip->release(irq, dev_id); #endif /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { desc->status |= IRQ_DISABLED; - if (desc->chip->shutdown) - desc->chip->shutdown(irq); + if (desc->irq_data.chip->irq_shutdown) + desc->irq_data.chip->irq_shutdown(&desc->irq_data); else - desc->chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); } #ifdef CONFIG_SMP @@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(free_irq); @@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->dev_id = dev_id; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); if (retval) kfree(action); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 241962280836..1d2541940480 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -7,6 +7,7 @@ void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip = desc->irq_data.chip; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -24,7 +25,7 @@ void move_masked_irq(int irq) if (unlikely(cpumask_empty(desc->pending_mask))) return; - if (!desc->chip->set_affinity) + if (!chip->irq_set_affinity) return; assert_raw_spin_locked(&desc->lock); @@ -43,8 +44,9 @@ void move_masked_irq(int irq) */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)) - if (!desc->chip->set_affinity(irq, desc->pending_mask)) { - cpumask_copy(desc->affinity, desc->pending_mask); + if (!chip->irq_set_affinity(&desc->irq_data, + desc->pending_mask, false)) { + cpumask_copy(desc->irq_data.affinity, desc->pending_mask); irq_set_thread_affinity(desc); } @@ -61,8 +63,8 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->chip->mask(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); move_masked_irq(irq); - desc->chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); } diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c deleted file mode 100644 index 65d3845665ac..000000000000 --- a/kernel/irq/numa_migrate.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * NUMA irq-desc migration code - * - * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to - * the new "home node" of the IRQ. - */ - -#include <linux/irq.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/random.h> -#include <linux/interrupt.h> -#include <linux/kernel_stat.h> - -#include "internals.h" - -static void init_copy_kstat_irqs(struct irq_desc *old_desc, - struct irq_desc *desc, - int node, int nr) -{ - init_kstat_irqs(desc, node, nr); - - if (desc->kstat_irqs != old_desc->kstat_irqs) - memcpy(desc->kstat_irqs, old_desc->kstat_irqs, - nr * sizeof(*desc->kstat_irqs)); -} - -static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) -{ - if (old_desc->kstat_irqs == desc->kstat_irqs) - return; - - kfree(old_desc->kstat_irqs); - old_desc->kstat_irqs = NULL; -} - -static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, - struct irq_desc *desc, int node) -{ - memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " - "for migration.\n", irq); - return false; - } - raw_spin_lock_init(&desc->lock); - desc->node = node; - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); - init_copy_desc_masks(old_desc, desc); - arch_init_copy_chip_data(old_desc, desc, node); - return true; -} - -static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) -{ - free_kstat_irqs(old_desc, desc); - free_desc_masks(old_desc, desc); - arch_free_chip_data(old_desc, desc); -} - -static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, - int node) -{ - struct irq_desc *desc; - unsigned int irq; - unsigned long flags; - - irq = old_desc->irq; - - raw_spin_lock_irqsave(&sparse_irq_lock, flags); - - /* We have to check it to avoid races with another CPU */ - desc = irq_to_desc(irq); - - if (desc && old_desc != desc) - goto out_unlock; - - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - if (!desc) { - printk(KERN_ERR "irq %d: can not get new irq_desc " - "for migration.\n", irq); - /* still use old one */ - desc = old_desc; - goto out_unlock; - } - if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { - /* still use old one */ - kfree(desc); - desc = old_desc; - goto out_unlock; - } - - replace_irq_desc(irq, desc); - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - /* free the old one */ - free_one_irq_desc(old_desc, desc); - kfree(old_desc); - - return desc; - -out_unlock: - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} - -struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) -{ - /* those static or target node is -1, do not move them */ - if (desc->irq < NR_IRQS_LEGACY || node == -1) - return desc; - - if (desc->node != node) - desc = __real_move_irq_desc(desc, node); - - return desc; -} - diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 09a2ee540bd2..01b1d3a88983 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->affinity; + const struct cpumask *mask = desc->irq_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) @@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || + if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; @@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long) m->private); - seq_printf(m, "%d\n", desc->node); + seq_printf(m, "%d\n", desc->irq_data.node); return 0; } @@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; - if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) + if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) return; memset(name, 0, MAX_NAMELEN); @@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) &irq_spurious_proc_fops, (void *)(long)irq); } +void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) +{ + char name [MAX_NAMELEN]; + + if (!root_irq_dir || !desc->dir) + return; +#ifdef CONFIG_SMP + remove_proc_entry("smp_affinity", desc->dir); + remove_proc_entry("affinity_hint", desc->dir); + remove_proc_entry("node", desc->dir); +#endif + remove_proc_entry("spurious", desc->dir); + + memset(name, 0, MAX_NAMELEN); + sprintf(name, "%u", irq); + remove_proc_entry(name, root_irq_dir); +} + #undef MAX_NAMELEN void unregister_handler_proc(unsigned int irq, struct irqaction *action) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 090c3763f3a2..891115a929aa 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) /* * Make sure the interrupt is enabled, before resending it: */ - desc->chip->enable(irq); + desc->irq_data.chip->irq_enable(&desc->irq_data); /* * We do not resend level type interrupts. Level type @@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; - if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { + if (!desc->irq_data.chip->irq_retrigger || + !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 89fb90ae534f..3089d3b9d5f3 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -14,6 +14,8 @@ #include <linux/moduleparam.h> #include <linux/timer.h> +#include "internals.h" + static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) @@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc) * If we did actual work for the real IRQ line we must let the * IRQ controller clean up too */ - if (work && desc->chip && desc->chip->end) - desc->chip->end(irq); + if (work) + irq_end(irq, desc); raw_spin_unlock(&desc->lock); return ok; @@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; desc->depth++; - desc->chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); diff --git a/kernel/irq_work.c b/kernel/irq_work.c new file mode 100644 index 000000000000..f16763ff8481 --- /dev/null +++ b/kernel/irq_work.c @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * Provides a framework for enqueueing and running callbacks from hardirq + * context. The enqueueing is NMI-safe. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/irq_work.h> +#include <linux/hardirq.h> + +/* + * An entry can be in one of four states: + * + * free NULL, 0 -> {claimed} : free to be used + * claimed NULL, 3 -> {pending} : claimed to be enqueued + * pending next, 3 -> {busy} : queued, pending callback + * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed + * + * We use the lower two bits of the next pointer to keep PENDING and BUSY + * flags. + */ + +#define IRQ_WORK_PENDING 1UL +#define IRQ_WORK_BUSY 2UL +#define IRQ_WORK_FLAGS 3UL + +static inline bool irq_work_is_set(struct irq_work *entry, int flags) +{ + return (unsigned long)entry->next & flags; +} + +static inline struct irq_work *irq_work_next(struct irq_work *entry) +{ + unsigned long next = (unsigned long)entry->next; + next &= ~IRQ_WORK_FLAGS; + return (struct irq_work *)next; +} + +static inline struct irq_work *next_flags(struct irq_work *entry, int flags) +{ + unsigned long next = (unsigned long)entry; + next |= flags; + return (struct irq_work *)next; +} + +static DEFINE_PER_CPU(struct irq_work *, irq_work_list); + +/* + * Claim the entry so that no one else will poke at it. + */ +static bool irq_work_claim(struct irq_work *entry) +{ + struct irq_work *next, *nflags; + + do { + next = entry->next; + if ((unsigned long)next & IRQ_WORK_PENDING) + return false; + nflags = next_flags(next, IRQ_WORK_FLAGS); + } while (cmpxchg(&entry->next, next, nflags) != next); + + return true; +} + + +void __weak arch_irq_work_raise(void) +{ + /* + * Lame architectures will get the timer tick callback + */ +} + +/* + * Queue the entry and raise the IPI if needed. + */ +static void __irq_work_queue(struct irq_work *entry) +{ + struct irq_work **head, *next; + + head = &get_cpu_var(irq_work_list); + + do { + next = *head; + /* Can assign non-atomic because we keep the flags set. */ + entry->next = next_flags(next, IRQ_WORK_FLAGS); + } while (cmpxchg(head, next, entry) != next); + + /* The list was empty, raise self-interrupt to start processing. */ + if (!irq_work_next(entry)) + arch_irq_work_raise(); + + put_cpu_var(irq_work_list); +} + +/* + * Enqueue the irq_work @entry, returns true on success, failure when the + * @entry was already enqueued by someone else. + * + * Can be re-enqueued while the callback is still in progress. + */ +bool irq_work_queue(struct irq_work *entry) +{ + if (!irq_work_claim(entry)) { + /* + * Already enqueued, can't do! + */ + return false; + } + + __irq_work_queue(entry); + return true; +} +EXPORT_SYMBOL_GPL(irq_work_queue); + +/* + * Run the irq_work entries on this cpu. Requires to be ran from hardirq + * context with local IRQs disabled. + */ +void irq_work_run(void) +{ + struct irq_work *list, **head; + + head = &__get_cpu_var(irq_work_list); + if (*head == NULL) + return; + + BUG_ON(!in_irq()); + BUG_ON(!irqs_disabled()); + + list = xchg(head, NULL); + while (list != NULL) { + struct irq_work *entry = list; + + list = irq_work_next(list); + + /* + * Clear the PENDING bit, after this point the @entry + * can be re-used. + */ + entry->next = next_flags(NULL, IRQ_WORK_BUSY); + entry->func(entry); + /* + * Clear the BUSY bit and return to the free state if + * no-one else claimed it meanwhile. + */ + cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); + } +} +EXPORT_SYMBOL_GPL(irq_work_run); + +/* + * Synchronize against the irq_work @entry, ensures the entry is not + * currently in use. + */ +void irq_work_sync(struct irq_work *entry) +{ + WARN_ON_ONCE(irqs_disabled()); + + while (irq_work_is_set(entry, IRQ_WORK_BUSY)) + cpu_relax(); +} +EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/jump_label.c b/kernel/jump_label.c new file mode 100644 index 000000000000..7be868bf25c6 --- /dev/null +++ b/kernel/jump_label.c @@ -0,0 +1,429 @@ +/* + * jump label support + * + * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> + * + */ +#include <linux/jump_label.h> +#include <linux/memory.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/sort.h> +#include <linux/err.h> + +#ifdef HAVE_JUMP_LABEL + +#define JUMP_LABEL_HASH_BITS 6 +#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) +static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; + +/* mutex to protect coming/going of the the jump_label table */ +static DEFINE_MUTEX(jump_label_mutex); + +struct jump_label_entry { + struct hlist_node hlist; + struct jump_entry *table; + int nr_entries; + /* hang modules off here */ + struct hlist_head modules; + unsigned long key; +}; + +struct jump_label_module_entry { + struct hlist_node hlist; + struct jump_entry *table; + int nr_entries; + struct module *mod; +}; + +static int jump_label_cmp(const void *a, const void *b) +{ + const struct jump_entry *jea = a; + const struct jump_entry *jeb = b; + + if (jea->key < jeb->key) + return -1; + + if (jea->key > jeb->key) + return 1; + + return 0; +} + +static void +sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) +{ + unsigned long size; + + size = (((unsigned long)stop - (unsigned long)start) + / sizeof(struct jump_entry)); + sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); +} + +static struct jump_label_entry *get_jump_label_entry(jump_label_t key) +{ + struct hlist_head *head; + struct hlist_node *node; + struct jump_label_entry *e; + u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); + + head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (key == e->key) + return e; + } + return NULL; +} + +static struct jump_label_entry * +add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) +{ + struct hlist_head *head; + struct jump_label_entry *e; + u32 hash; + + e = get_jump_label_entry(key); + if (e) + return ERR_PTR(-EEXIST); + + e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + + hash = jhash((void *)&key, sizeof(jump_label_t), 0); + head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; + e->key = key; + e->table = table; + e->nr_entries = nr_entries; + INIT_HLIST_HEAD(&(e->modules)); + hlist_add_head(&e->hlist, head); + return e; +} + +static int +build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) +{ + struct jump_entry *iter, *iter_begin; + struct jump_label_entry *entry; + int count; + + sort_jump_label_entries(start, stop); + iter = start; + while (iter < stop) { + entry = get_jump_label_entry(iter->key); + if (!entry) { + iter_begin = iter; + count = 0; + while ((iter < stop) && + (iter->key == iter_begin->key)) { + iter++; + count++; + } + entry = add_jump_label_entry(iter_begin->key, + count, iter_begin); + if (IS_ERR(entry)) + return PTR_ERR(entry); + } else { + WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); + return -1; + } + } + return 0; +} + +/*** + * jump_label_update - update jump label text + * @key - key value associated with a a jump label + * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE + * + * Will enable/disable the jump for jump label @key, depending on the + * value of @type. + * + */ + +void jump_label_update(unsigned long key, enum jump_label_type type) +{ + struct jump_entry *iter; + struct jump_label_entry *entry; + struct hlist_node *module_node; + struct jump_label_module_entry *e_module; + int count; + + mutex_lock(&jump_label_mutex); + entry = get_jump_label_entry((jump_label_t)key); + if (entry) { + count = entry->nr_entries; + iter = entry->table; + while (count--) { + if (kernel_text_address(iter->code)) + arch_jump_label_transform(iter, type); + iter++; + } + /* eanble/disable jump labels in modules */ + hlist_for_each_entry(e_module, module_node, &(entry->modules), + hlist) { + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (kernel_text_address(iter->code)) + arch_jump_label_transform(iter, type); + iter++; + } + } + } + mutex_unlock(&jump_label_mutex); +} + +static int addr_conflict(struct jump_entry *entry, void *start, void *end) +{ + if (entry->code <= (unsigned long)end && + entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start) + return 1; + + return 0; +} + +#ifdef CONFIG_MODULES + +static int module_conflict(void *start, void *end) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + struct jump_entry *iter; + int i, count; + int conflict = 0; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (addr_conflict(iter, start, end)) { + conflict = 1; + goto out; + } + iter++; + } + } + } + } +out: + return conflict; +} + +#endif + +/*** + * jump_label_text_reserved - check if addr range is reserved + * @start: start text addr + * @end: end text addr + * + * checks if the text addr located between @start and @end + * overlaps with any of the jump label patch addresses. Code + * that wants to modify kernel text should first verify that + * it does not overlap with any of the jump label addresses. + * + * returns 1 if there is an overlap, 0 otherwise + */ +int jump_label_text_reserved(void *start, void *end) +{ + struct jump_entry *iter; + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __start___jump_table; + int conflict = 0; + + mutex_lock(&jump_label_mutex); + iter = iter_start; + while (iter < iter_stop) { + if (addr_conflict(iter, start, end)) { + conflict = 1; + goto out; + } + iter++; + } + + /* now check modules */ +#ifdef CONFIG_MODULES + conflict = module_conflict(start, end); +#endif +out: + mutex_unlock(&jump_label_mutex); + return conflict; +} + +static __init int init_jump_label(void) +{ + int ret; + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_entry *iter; + + mutex_lock(&jump_label_mutex); + ret = build_jump_label_hashtable(__start___jump_table, + __stop___jump_table); + iter = iter_start; + while (iter < iter_stop) { + arch_jump_label_text_poke_early(iter->code); + iter++; + } + mutex_unlock(&jump_label_mutex); + return ret; +} +early_initcall(init_jump_label); + +#ifdef CONFIG_MODULES + +static struct jump_label_module_entry * +add_jump_label_module_entry(struct jump_label_entry *entry, + struct jump_entry *iter_begin, + int count, struct module *mod) +{ + struct jump_label_module_entry *e; + + e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + e->mod = mod; + e->nr_entries = count; + e->table = iter_begin; + hlist_add_head(&e->hlist, &entry->modules); + return e; +} + +static int add_jump_label_module(struct module *mod) +{ + struct jump_entry *iter, *iter_begin; + struct jump_label_entry *entry; + struct jump_label_module_entry *module_entry; + int count; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return 0; + + sort_jump_label_entries(mod->jump_entries, + mod->jump_entries + mod->num_jump_entries); + iter = mod->jump_entries; + while (iter < mod->jump_entries + mod->num_jump_entries) { + entry = get_jump_label_entry(iter->key); + iter_begin = iter; + count = 0; + while ((iter < mod->jump_entries + mod->num_jump_entries) && + (iter->key == iter_begin->key)) { + iter++; + count++; + } + if (!entry) { + entry = add_jump_label_entry(iter_begin->key, 0, NULL); + if (IS_ERR(entry)) + return PTR_ERR(entry); + } + module_entry = add_jump_label_module_entry(entry, iter_begin, + count, mod); + if (IS_ERR(module_entry)) + return PTR_ERR(module_entry); + } + return 0; +} + +static void remove_jump_label_module(struct module *mod) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + int i; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + if (e_module->mod == mod) { + hlist_del(&e_module->hlist); + kfree(e_module); + } + } + if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { + hlist_del(&e->hlist); + kfree(e); + } + } + } +} + +static int +jump_label_module_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + mutex_lock(&jump_label_mutex); + ret = add_jump_label_module(mod); + if (ret) + remove_jump_label_module(mod); + mutex_unlock(&jump_label_mutex); + break; + case MODULE_STATE_GOING: + mutex_lock(&jump_label_mutex); + remove_jump_label_module(mod); + mutex_unlock(&jump_label_mutex); + break; + } + return ret; +} + +/*** + * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() + * @mod: module to patch + * + * Allow for run-time selection of the optimal nops. Before the module + * loads patch these with arch_get_jump_label_nop(), which is specified by + * the arch specific jump label code. + */ +void jump_label_apply_nops(struct module *mod) +{ + struct jump_entry *iter; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + iter = mod->jump_entries; + while (iter < mod->jump_entries + mod->num_jump_entries) { + arch_jump_label_text_poke_early(iter->code); + iter++; + } +} + +struct notifier_block jump_label_module_nb = { + .notifier_call = jump_label_module_notify, + .priority = 0, +}; + +static __init int init_jump_label_module(void) +{ + return register_module_notifier(&jump_label_module_nb); +} +early_initcall(init_jump_label_module); + +#endif /* CONFIG_MODULES */ + +#endif diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 282035f3ae96..7c44133f51ec 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -47,6 +47,7 @@ #include <linux/memory.h> #include <linux/ftrace.h> #include <linux/cpu.h> +#include <linux/jump_label.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> @@ -73,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobes_all_disarmed; -static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ +/* This protects kprobe_table and optimizing_list */ +static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { spinlock_t lock ____cacheline_aligned_in_smp; @@ -399,7 +401,7 @@ static inline int kprobe_optready(struct kprobe *p) * Return an optimized kprobe whose optimizing code replaces * instructions including addr (exclude breakpoint). */ -struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) { int i; struct kprobe *p = NULL; @@ -594,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) } #ifdef CONFIG_SYSCTL +/* This should be called with kprobe_mutex locked */ static void __kprobes optimize_all_kprobes(void) { struct hlist_head *head; @@ -606,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void) return; kprobes_allow_optimization = true; - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) optimize_kprobe(p); } - mutex_unlock(&text_mutex); printk(KERN_INFO "Kprobes globally optimized\n"); } +/* This should be called with kprobe_mutex locked */ static void __kprobes unoptimize_all_kprobes(void) { struct hlist_head *head; @@ -831,6 +833,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, void __kprobes kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) +__acquires(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -842,6 +845,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, static void __kprobes kretprobe_table_lock(unsigned long hash, unsigned long *flags) +__acquires(hlist_lock) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); @@ -849,6 +853,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash, void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) +__releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -857,7 +862,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, spin_unlock_irqrestore(hlist_lock, *flags); } -void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +static void __kprobes kretprobe_table_unlock(unsigned long hash, + unsigned long *flags) +__releases(hlist_lock) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_unlock_irqrestore(hlist_lock, *flags); @@ -1141,7 +1148,8 @@ int __kprobes register_kprobe(struct kprobe *p) preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || - ftrace_text_reserved(p->addr, p->addr)) { + ftrace_text_reserved(p->addr, p->addr) || + jump_label_text_reserved(p->addr, p->addr)) { preempt_enable(); return -EINVAL; } @@ -1339,18 +1347,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - unsigned long addr; + unsigned long addr, offset; jp = jps[i]; addr = arch_deref_entry_point(jp->entry); - if (!kernel_text_address(addr)) - ret = -EINVAL; - else { - /* Todo: Verify probepoint is a function entry point */ + /* Verify probepoint is a function entry point */ + if (kallsyms_lookup_size_offset(addr, NULL, &offset) && + offset == 0) { jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; ret = register_kprobe(&jp->kp); - } + } else + ret = -EINVAL; + if (ret < 0) { if (i > 0) unregister_jprobes(jps, i); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f2852a510232..42ba65dff7d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } #endif + if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + debug_locks_off(); + printk(KERN_ERR + "BUG: looking up invalid subclass: %u\n", subclass); + printk(KERN_ERR + "turning off the locking correctness validator.\n"); + dump_stack(); + return NULL; + } + /* * Static locks do not have their class-keys yet - for them the key * is the lock object itself: @@ -774,7 +784,9 @@ out_unlock_set: raw_local_irq_restore(flags); if (!subclass || force) - lock->class_cache = class; + lock->class_cache[0] = class; + else if (subclass < NR_LOCKDEP_CACHING_CLASSES) + lock->class_cache[subclass] = class; if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) return NULL; @@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - lock->class_cache = NULL; + int i; + + for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) + lock->class_cache[i] = NULL; + #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); #endif @@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { - debug_locks_off(); - printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - return 0; - } - if (lock->key == &__lockdep_no_validate__) check = 1; - if (!subclass) - class = lock->class_cache; + if (subclass < NR_LOCKDEP_CACHING_CLASSES) + class = lock->class_cache[subclass]; /* - * Not cached yet or subclass? + * Not cached? */ if (unlikely(!class)) { class = register_lock_class(lock, subclass, 0); @@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) return 1; if (hlock->references) { - struct lock_class *class = lock->class_cache; + struct lock_class *class = lock->class_cache[0]; if (!class) class = look_up_lock_class(lock, 0); @@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) if (list_empty(head)) continue; list_for_each_entry_safe(class, next, head, hash_entry) { - if (unlikely(class == lock->class_cache)) { + int match = 0; + + for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) + match |= class == lock->class_cache[j]; + + if (unlikely(match)) { if (debug_locks_off_graph_unlock()) WARN_ON(1); goto out_restore; @@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks); * Careful: only use this function if you are sure that * the task cannot run in parallel! */ -void __debug_show_held_locks(struct task_struct *task) +void debug_show_held_locks(struct task_struct *task) { if (unlikely(!debug_locks)) { printk("INFO: lockdep is turned off.\n"); @@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task) } lockdep_print_held_locks(task); } -EXPORT_SYMBOL_GPL(__debug_show_held_locks); - -void debug_show_held_locks(struct task_struct *task) -{ - __debug_show_held_locks(task); -} EXPORT_SYMBOL_GPL(debug_show_held_locks); void lockdep_sys_exit(void) diff --git a/kernel/module.c b/kernel/module.c index ccd641991842..2df46301a7a4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -55,6 +55,7 @@ #include <linux/async.h> #include <linux/percpu.h> #include <linux/kmemleak.h> +#include <linux/jump_label.h> #define CREATE_TRACE_POINTS #include <trace/events/module.h> @@ -2309,6 +2310,11 @@ static void find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints), &mod->num_tracepoints); #endif +#ifdef HAVE_JUMP_LABEL + mod->jump_entries = section_objs(info, "__jump_table", + sizeof(*mod->jump_entries), + &mod->num_jump_entries); +#endif #ifdef CONFIG_EVENT_TRACING mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b98bed3d8182..517d827f4982 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -31,24 +31,18 @@ #include <linux/kernel_stat.h> #include <linux/perf_event.h> #include <linux/ftrace_event.h> -#include <linux/hw_breakpoint.h> #include <asm/irq_regs.h> -/* - * Each CPU has a list of per CPU events: - */ -static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); - -int perf_max_events __read_mostly = 1; -static int perf_reserved_percpu __read_mostly; -static int perf_overcommit __read_mostly = 1; - -static atomic_t nr_events __read_mostly; +atomic_t perf_task_events __read_mostly; static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; +static LIST_HEAD(pmus); +static DEFINE_MUTEX(pmus_lock); +static struct srcu_struct pmus_srcu; + /* * perf event paranoia level: * -1 - not paranoid at all @@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; static atomic64_t perf_event_id; -/* - * Lock for (sysadmin-configurable) event reservations: - */ -static DEFINE_SPINLOCK(perf_resource_lock); +void __weak perf_event_print_debug(void) { } -/* - * Architecture provided APIs - weak aliases: - */ -extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) +extern __weak const char *perf_pmu_name(void) { - return NULL; + return "pmu"; } -void __weak hw_perf_disable(void) { barrier(); } -void __weak hw_perf_enable(void) { barrier(); } - -void __weak perf_event_print_debug(void) { } - -static DEFINE_PER_CPU(int, perf_disable_count); +void perf_pmu_disable(struct pmu *pmu) +{ + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!(*count)++) + pmu->pmu_disable(pmu); +} -void perf_disable(void) +void perf_pmu_enable(struct pmu *pmu) { - if (!__get_cpu_var(perf_disable_count)++) - hw_perf_disable(); + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!--(*count)) + pmu->pmu_enable(pmu); } -void perf_enable(void) +static DEFINE_PER_CPU(struct list_head, rotation_list); + +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_pmu_rotate_start(struct pmu *pmu) { - if (!--__get_cpu_var(perf_disable_count)) - hw_perf_enable(); + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct list_head *head = &__get_cpu_var(rotation_list); + + WARN_ON(!irqs_disabled()); + + if (list_empty(&cpuctx->rotation_list)) + list_add(&cpuctx->rotation_list, head); } static void get_ctx(struct perf_event_context *ctx) @@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event) * the context could get moved to another task. */ static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, unsigned long *flags) +perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) { struct perf_event_context *ctx; rcu_read_lock(); - retry: - ctx = rcu_dereference(task->perf_event_ctxp); +retry: + ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { /* * If this context is a clone of another, it might @@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * can't get swapped on us any more. */ raw_spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_event_ctxp)) { + if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { raw_spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } @@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ -static struct perf_event_context *perf_pin_task_context(struct task_struct *task) +static struct perf_event_context * +perf_pin_task_context(struct task_struct *task, int ctxn) { struct perf_event_context *ctx; unsigned long flags; - ctx = perf_lock_task_context(task, &flags); + ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) } list_add_rcu(&event->event_entry, &ctx->event_list); + if (!ctx->nr_events) + perf_pmu_rotate_start(ctx->pmu); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader; - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); + /* + * We can have double attach due to group movement in perf_event_open. + */ + if (event->attach_state & PERF_ATTACH_GROUP) + return; + event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) @@ -436,7 +445,7 @@ event_sched_out(struct perf_event *event, event->state = PERF_EVENT_STATE_OFF; } event->tstamp_stopped = ctx->time; - event->pmu->disable(event); + event->pmu->del(event, 0); event->oncpu = -1; if (!is_software_event(event)) @@ -466,6 +475,12 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + /* * Cross CPU call to remove a performance event * @@ -474,9 +489,9 @@ group_sched_out(struct perf_event *group_event, */ static void __perf_event_remove_from_context(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a task context, we need to check whether it is @@ -487,27 +502,11 @@ static void __perf_event_remove_from_context(void *info) return; raw_spin_lock(&ctx->lock); - /* - * Protect the list operation against NMI by disabling the - * events on a global level. - */ - perf_disable(); event_sched_out(event, cpuctx, ctx); list_del_event(event, ctx); - if (!ctx->task) { - /* - * Allow more per task events with respect to the - * reservation: - */ - cpuctx->max_pertask = - min(perf_max_events - ctx->nr_events, - perf_max_events - perf_reserved_percpu); - } - - perf_enable(); raw_spin_unlock(&ctx->lock); } @@ -572,8 +571,8 @@ retry: static void __perf_event_disable(void *info) { struct perf_event *event = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a per-task event, need to check whether this @@ -628,7 +627,7 @@ void perf_event_disable(struct perf_event *event) return; } - retry: +retry: task_oncpu_function_call(task, __perf_event_disable, event); raw_spin_lock_irq(&ctx->lock); @@ -667,7 +666,7 @@ event_sched_in(struct perf_event *event, */ smp_wmb(); - if (event->pmu->enable(event)) { + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; return -EAGAIN; @@ -691,22 +690,17 @@ group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - const struct pmu *pmu = group_event->pmu; - bool txn = false; + struct pmu *pmu = group_event->pmu; + u64 now = ctx->time; + bool simulate = false; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - /* Check if group transaction availabe */ - if (pmu->start_txn) - txn = true; - - if (txn) - pmu->start_txn(pmu); + pmu->start_txn(pmu); if (event_sched_in(group_event, cpuctx, ctx)) { - if (txn) - pmu->cancel_txn(pmu); + pmu->cancel_txn(pmu); return -EAGAIN; } @@ -720,23 +714,38 @@ group_sched_in(struct perf_event *group_event, } } - if (!txn || !pmu->commit_txn(pmu)) + if (!pmu->commit_txn(pmu)) return 0; group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: + * The events up to the failed event are scheduled out normally, + * tstamp_stopped will be updated. + * + * The failed events and the remaining siblings need to have + * their timings updated as if they had gone thru event_sched_in() + * and event_sched_out(). This is required to get consistent timings + * across the group. This also takes care of the case where the group + * could never be scheduled by ensuring tstamp_stopped is set to mark + * the time the event was actually stopped, such that time delta + * calculation in update_event_times() is correct. */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) - break; - event_sched_out(event, cpuctx, ctx); + simulate = true; + + if (simulate) { + event->tstamp_running += now - event->tstamp_stopped; + event->tstamp_stopped = now; + } else { + event_sched_out(event, cpuctx, ctx); + } } event_sched_out(group_event, cpuctx, ctx); - if (txn) - pmu->cancel_txn(pmu); + pmu->cancel_txn(pmu); return -EAGAIN; } @@ -789,10 +798,10 @@ static void add_event_to_ctx(struct perf_event *event, */ static void __perf_install_in_context(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; /* @@ -812,12 +821,6 @@ static void __perf_install_in_context(void *info) ctx->is_active = 1; update_context_time(ctx); - /* - * Protect the list operation against NMI by disabling the - * events on a global level. NOP for non NMI based events. - */ - perf_disable(); - add_event_to_ctx(event, ctx); if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -855,12 +858,7 @@ static void __perf_install_in_context(void *info) } } - if (!err && !ctx->task && cpuctx->max_pertask) - cpuctx->max_pertask--; - - unlock: - perf_enable(); - +unlock: raw_spin_unlock(&ctx->lock); } @@ -883,6 +881,8 @@ perf_install_in_context(struct perf_event_context *ctx, { struct task_struct *task = ctx->task; + event->ctx = ctx; + if (!task) { /* * Per cpu events are installed via an smp call and @@ -931,10 +931,12 @@ static void __perf_event_mark_enabled(struct perf_event *event, event->state = PERF_EVENT_STATE_INACTIVE; event->tstamp_enabled = ctx->time - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) - if (sub->state >= PERF_EVENT_STATE_INACTIVE) + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { sub->tstamp_enabled = ctx->time - sub->total_time_enabled; + } + } } /* @@ -943,9 +945,9 @@ static void __perf_event_mark_enabled(struct perf_event *event, static void __perf_event_enable(void *info) { struct perf_event *event = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; /* @@ -979,12 +981,10 @@ static void __perf_event_enable(void *info) if (!group_can_go_on(event, cpuctx, 1)) { err = -EEXIST; } else { - perf_disable(); if (event == leader) err = group_sched_in(event, cpuctx, ctx); else err = event_sched_in(event, cpuctx, ctx); - perf_enable(); } if (err) { @@ -1000,7 +1000,7 @@ static void __perf_event_enable(void *info) } } - unlock: +unlock: raw_spin_unlock(&ctx->lock); } @@ -1041,7 +1041,7 @@ void perf_event_enable(struct perf_event *event) if (event->state == PERF_EVENT_STATE_ERROR) event->state = PERF_EVENT_STATE_OFF; - retry: +retry: raw_spin_unlock_irq(&ctx->lock); task_oncpu_function_call(task, __perf_event_enable, event); @@ -1061,7 +1061,7 @@ void perf_event_enable(struct perf_event *event) if (event->state == PERF_EVENT_STATE_OFF) __perf_event_mark_enabled(event, ctx); - out: +out: raw_spin_unlock_irq(&ctx->lock); } @@ -1092,26 +1092,26 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_event *event; raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); ctx->is_active = 0; if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); - perf_disable(); if (!ctx->nr_active) - goto out_enable; + goto out; - if (event_type & EVENT_PINNED) + if (event_type & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); + } - if (event_type & EVENT_FLEXIBLE) + if (event_type & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); - - out_enable: - perf_enable(); - out: + } +out: + perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -1209,34 +1209,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -/* - * Called from scheduler to remove the events of the current task, - * with interrupts disabled. - * - * We stop each event and update the event value in event->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * not restart the event. - */ -void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next) +void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; struct perf_event_context *parent; + struct perf_cpu_context *cpuctx; int do_switch = 1; - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + if (likely(!ctx)) + return; - if (likely(!ctx || !cpuctx->task_ctx)) + cpuctx = __get_cpu_context(ctx); + if (!cpuctx->task_ctx) return; rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_event_ctxp; + next_ctx = next->perf_event_ctxp[ctxn]; if (parent && next_ctx && rcu_dereference(next_ctx->parent_ctx) == parent) { /* @@ -1255,8 +1246,8 @@ void perf_event_task_sched_out(struct task_struct *task, * XXX do we need a memory barrier of sorts * wrt to rcu_dereference() of perf_event_ctxp */ - task->perf_event_ctxp = next_ctx; - next->perf_event_ctxp = ctx; + task->perf_event_ctxp[ctxn] = next_ctx; + next->perf_event_ctxp[ctxn] = ctx; ctx->task = next; next_ctx->task = task; do_switch = 0; @@ -1274,10 +1265,35 @@ void perf_event_task_sched_out(struct task_struct *task, } } +#define for_each_task_context_nr(ctxn) \ + for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void __perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next) +{ + int ctxn; + + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + + for_each_task_context_nr(ctxn) + perf_event_context_sched_out(task, ctxn, next); +} + static void task_ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); if (!cpuctx->task_ctx) return; @@ -1292,14 +1308,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, /* * Called with IRQs disabled */ -static void __perf_event_task_sched_out(struct perf_event_context *ctx) -{ - task_ctx_sched_out(ctx, EVENT_ALL); -} - -/* - * Called with IRQs disabled - */ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type) { @@ -1350,9 +1358,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; - if (group_can_go_on(event, cpuctx, can_add_hw)) + if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; + } } } @@ -1368,8 +1377,6 @@ ctx_sched_in(struct perf_event_context *ctx, ctx->timestamp = perf_clock(); - perf_disable(); - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -1381,8 +1388,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (event_type & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); - perf_enable(); - out: +out: raw_spin_unlock(&ctx->lock); } @@ -1394,43 +1400,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, event_type); } -static void task_ctx_sched_in(struct task_struct *task, +static void task_ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_cpu_context *cpuctx; - if (likely(!ctx)) - return; + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; + ctx_sched_in(ctx, cpuctx, event_type); cpuctx->task_ctx = ctx; } -/* - * Called from scheduler to add the events of the current task - * with interrupts disabled. - * - * We restore the event value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * keep the event running. - */ -void perf_event_task_sched_in(struct task_struct *task) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; - if (likely(!ctx)) - return; +void perf_event_context_sched_in(struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx; + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; - perf_disable(); - + perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1444,7 +1435,37 @@ void perf_event_task_sched_in(struct task_struct *task) cpuctx->task_ctx = ctx; - perf_enable(); + /* + * Since these rotations are per-cpu, we need to ensure the + * cpu-context we got scheduled on is actually rotating. + */ + perf_pmu_rotate_start(ctx->pmu); + perf_pmu_enable(ctx->pmu); +} + +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void __perf_event_task_sched_in(struct task_struct *task) +{ + struct perf_event_context *ctx; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (likely(!ctx)) + continue; + + perf_event_context_sched_in(ctx); + } } #define MAX_INTERRUPTS (~0ULL) @@ -1524,22 +1545,6 @@ do { \ return div64_u64(dividend, divisor); } -static void perf_event_stop(struct perf_event *event) -{ - if (!event->pmu->stop) - return event->pmu->disable(event); - - return event->pmu->stop(event); -} - -static int perf_event_start(struct perf_event *event) -{ - if (!event->pmu->start) - return event->pmu->enable(event); - - return event->pmu->start(event); -} - static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; @@ -1559,15 +1564,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - perf_disable(); - perf_event_stop(event); + event->pmu->stop(event, PERF_EF_UPDATE); local64_set(&hwc->period_left, 0); - perf_event_start(event); - perf_enable(); + event->pmu->start(event, PERF_EF_RELOAD); } } -static void perf_ctx_adjust_freq(struct perf_event_context *ctx) +static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) { struct perf_event *event; struct hw_perf_event *hwc; @@ -1592,23 +1595,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); - perf_disable(); - event->pmu->unthrottle(event); - perf_enable(); + event->pmu->start(event, 0); } if (!event->attr.freq || !event->attr.sample_freq) continue; - perf_disable(); event->pmu->read(event); now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; if (delta > 0) - perf_adjust_period(event, TICK_NSEC, delta); - perf_enable(); + perf_adjust_period(event, period, delta); } raw_spin_unlock(&ctx->lock); } @@ -1626,32 +1625,38 @@ static void rotate_ctx(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); } -void perf_event_task_tick(struct task_struct *curr) +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_rotate_context(struct perf_cpu_context *cpuctx) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - int rotate = 0; - - if (!atomic_read(&nr_events)) - return; + u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; + struct perf_event_context *ctx = NULL; + int rotate = 0, remove = 1; - cpuctx = &__get_cpu_var(perf_cpu_context); - if (cpuctx->ctx.nr_events && - cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; + if (cpuctx->ctx.nr_events) { + remove = 0; + if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) + rotate = 1; + } - ctx = curr->perf_event_ctxp; - if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) - rotate = 1; + ctx = cpuctx->task_ctx; + if (ctx && ctx->nr_events) { + remove = 0; + if (ctx->nr_events != ctx->nr_active) + rotate = 1; + } - perf_ctx_adjust_freq(&cpuctx->ctx); + perf_pmu_disable(cpuctx->ctx.pmu); + perf_ctx_adjust_freq(&cpuctx->ctx, interval); if (ctx) - perf_ctx_adjust_freq(ctx); + perf_ctx_adjust_freq(ctx, interval); if (!rotate) - return; + goto done; - perf_disable(); cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_out(ctx, EVENT_FLEXIBLE); @@ -1662,8 +1667,27 @@ void perf_event_task_tick(struct task_struct *curr) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_in(curr, EVENT_FLEXIBLE); - perf_enable(); + task_ctx_sched_in(ctx, EVENT_FLEXIBLE); + +done: + if (remove) + list_del_init(&cpuctx->rotation_list); + + perf_pmu_enable(cpuctx->ctx.pmu); +} + +void perf_event_task_tick(void) +{ + struct list_head *head = &__get_cpu_var(rotation_list); + struct perf_cpu_context *cpuctx, *tmp; + + WARN_ON(!irqs_disabled()); + + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { + if (cpuctx->jiffies_interval == 1 || + !(jiffies % cpuctx->jiffies_interval)) + perf_rotate_context(cpuctx); + } } static int event_enable_on_exec(struct perf_event *event, @@ -1685,20 +1709,18 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ -static void perf_event_enable_on_exec(struct task_struct *task) +static void perf_event_enable_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx; struct perf_event *event; unsigned long flags; int enabled = 0; int ret; local_irq_save(flags); - ctx = task->perf_event_ctxp; if (!ctx || !ctx->nr_events) goto out; - __perf_event_task_sched_out(ctx); + task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); @@ -1722,8 +1744,8 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_unlock(&ctx->lock); - perf_event_task_sched_in(task); - out: + perf_event_context_sched_in(ctx); +out: local_irq_restore(flags); } @@ -1732,9 +1754,9 @@ static void perf_event_enable_on_exec(struct task_struct *task) */ static void __perf_event_read(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a task context, we need to check whether it is @@ -1773,7 +1795,13 @@ static u64 perf_event_read(struct perf_event *event) unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); - update_context_time(ctx); + /* + * may read while context is not active + * (e.g., thread is blocked), in that case + * we cannot update context time + */ + if (ctx->is_active) + update_context_time(ctx); update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -1782,11 +1810,219 @@ static u64 perf_event_read(struct perf_event *event) } /* - * Initialize the perf_event context in a task_struct: + * Callchain support */ + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * + num_possible_cpus(); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +static int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +static void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + static void -__perf_event_init_context(struct perf_event_context *ctx, - struct task_struct *task) +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} + +/* + * Initialize the perf_event context in a task_struct: + */ +static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); @@ -1794,45 +2030,38 @@ __perf_event_init_context(struct perf_event_context *ctx, INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - ctx->task = task; } -static struct perf_event_context *find_get_context(pid_t pid, int cpu) +static struct perf_event_context * +alloc_perf_context(struct pmu *pmu, struct task_struct *task) { struct perf_event_context *ctx; - struct perf_cpu_context *cpuctx; - struct task_struct *task; - unsigned long flags; - int err; - - if (pid == -1 && cpu != -1) { - /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - if (cpu < 0 || cpu >= nr_cpumask_bits) - return ERR_PTR(-EINVAL); + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!ctx) + return NULL; - /* - * We could be clever and allow to attach a event to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_online(cpu)) - return ERR_PTR(-ENODEV); + __perf_event_init_context(ctx); + if (task) { + ctx->task = task; + get_task_struct(task); + } + ctx->pmu = pmu; - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); + return ctx; +} - return ctx; - } +static struct task_struct * +find_lively_task_by_vpid(pid_t vpid) +{ + struct task_struct *task; + int err; rcu_read_lock(); - if (!pid) + if (!vpid) task = current; else - task = find_task_by_vpid(pid); + task = find_task_by_vpid(vpid); if (task) get_task_struct(task); rcu_read_unlock(); @@ -1852,36 +2081,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto errout; - retry: - ctx = perf_lock_task_context(task, &flags); + return task; +errout: + put_task_struct(task); + return ERR_PTR(err); + +} + +static struct perf_event_context * +find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + unsigned long flags; + int ctxn, err; + + if (!task && cpu != -1) { + /* Must be root to operate on a CPU event: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu >= nr_cpumask_bits) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a event to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_online(cpu)) + return ERR_PTR(-ENODEV); + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + + return ctx; + } + + err = -EINVAL; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto errout; + +retry: + ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { unclone_ctx(ctx); raw_spin_unlock_irqrestore(&ctx->lock, flags); } if (!ctx) { - ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); + ctx = alloc_perf_context(pmu, task); err = -ENOMEM; if (!ctx) goto errout; - __perf_event_init_context(ctx, task); + get_ctx(ctx); - if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { + + if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { /* * We raced with some other task; use * the context they set. */ + put_task_struct(task); kfree(ctx); goto retry; } - get_task_struct(task); } - put_task_struct(task); return ctx; - errout: - put_task_struct(task); +errout: return ERR_PTR(err); } @@ -1898,21 +2169,23 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void perf_pending_sync(struct perf_event *event); static void perf_buffer_put(struct perf_buffer *buffer); static void free_event(struct perf_event *event) { - perf_pending_sync(event); + irq_work_sync(&event->pending); if (!event->parent) { - atomic_dec(&nr_events); + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_dec(&perf_task_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.task) atomic_dec(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); } if (event->buffer) { @@ -1923,7 +2196,9 @@ static void free_event(struct perf_event *event) if (event->destroy) event->destroy(event); - put_ctx(event->ctx); + if (event->ctx) + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); } @@ -2342,6 +2617,9 @@ int perf_event_task_disable(void) static int perf_event_index(struct perf_event *event) { + if (event->hw.state & PERF_HES_STOPPED) + return 0; + if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; @@ -2845,16 +3123,7 @@ void perf_event_wakeup(struct perf_event *event) } } -/* - * Pending wakeups - * - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. - * - * The NMI bit means we cannot possibly take locks. Therefore, maintain a - * single linked list and use cmpxchg() to add entries lockless. - */ - -static void perf_pending_event(struct perf_pending_entry *entry) +static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); @@ -2870,99 +3139,6 @@ static void perf_pending_event(struct perf_pending_entry *entry) } } -#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) - -static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { - PENDING_TAIL, -}; - -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; - - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) - return; - - entry->func = func; - - head = &get_cpu_var(perf_pending_head); - - do { - entry->next = *head; - } while (cmpxchg(head, entry->next, entry) != entry->next); - - set_perf_event_pending(); - - put_cpu_var(perf_pending_head); -} - -static int __perf_pending_run(void) -{ - struct perf_pending_entry *list; - int nr = 0; - - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); - while (list != PENDING_TAIL) { - void (*func)(struct perf_pending_entry *); - struct perf_pending_entry *entry = list; - - list = list->next; - - func = entry->func; - entry->next = NULL; - /* - * Ensure we observe the unqueue before we issue the wakeup, - * so that we won't be waiting forever. - * -- see perf_not_pending(). - */ - smp_wmb(); - - func(entry); - nr++; - } - - return nr; -} - -static inline int perf_not_pending(struct perf_event *event) -{ - /* - * If we flush on whatever cpu we run, there is a chance we don't - * need to wait. - */ - get_cpu(); - __perf_pending_run(); - put_cpu(); - - /* - * Ensure we see the proper queue state before going to sleep - * so that we do not miss the wakeup. -- see perf_pending_handle() - */ - smp_rmb(); - return event->pending.next == NULL; -} - -static void perf_pending_sync(struct perf_event *event) -{ - wait_event(event->waitq, perf_not_pending(event)); -} - -void perf_event_do_pending(void) -{ - __perf_pending_run(); -} - -/* - * Callchain support -- arch specific - */ - -__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - return NULL; -} - - /* * We assume there is only KVM supporting the callbacks. * Later on, we might change it to a list if there is @@ -3012,8 +3188,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) if (handle->nmi) { handle->event->pending_wakeup = 1; - perf_pending_queue(&handle->event->pending, - perf_pending_event); + irq_work_queue(&handle->event->pending); } else perf_event_wakeup(handle->event); } @@ -3069,7 +3244,7 @@ again: if (handle->wakeup != local_read(&buffer->wakeup)) perf_output_wakeup(handle); - out: +out: preempt_enable(); } @@ -3457,14 +3632,20 @@ static void perf_event_output(struct perf_event *event, int nmi, struct perf_output_handle handle; struct perf_event_header header; + /* protect the callchain buffers */ + rcu_read_lock(); + perf_prepare_sample(&header, data, event, regs); if (perf_output_begin(&handle, event, header.size, nmi, 1)) - return; + goto exit; perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); + +exit: + rcu_read_unlock(); } /* @@ -3578,16 +3759,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, static void perf_event_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx = task_event->task_ctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int ctxn; rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_task_ctx(&cpuctx->ctx, task_event); - if (!ctx) - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_task_ctx(ctx, task_event); - put_cpu_var(perf_cpu_context); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + perf_event_task_ctx(&cpuctx->ctx, task_event); + + ctx = task_event->task_ctx; + if (!ctx) { + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + } + if (ctx) + perf_event_task_ctx(ctx, task_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } rcu_read_unlock(); } @@ -3692,8 +3884,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - unsigned int size; char comm[TASK_COMM_LEN]; + unsigned int size; + struct pmu *pmu; + int ctxn; memset(comm, 0, sizeof(comm)); strlcpy(comm, comm_event->task->comm, sizeof(comm)); @@ -3705,21 +3899,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_comm_ctx(&cpuctx->ctx, comm_event); - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_comm_ctx(ctx, comm_event); - put_cpu_var(perf_cpu_context); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } rcu_read_unlock(); } void perf_event_comm(struct task_struct *task) { struct perf_comm_event comm_event; + struct perf_event_context *ctx; + int ctxn; - if (task->perf_event_ctxp) - perf_event_enable_on_exec(task); + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctx); + } if (!atomic_read(&nr_comm_events)) return; @@ -3821,6 +4030,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) char tmp[16]; char *buf = NULL; const char *name; + struct pmu *pmu; + int ctxn; memset(tmp, 0, sizeof(tmp)); @@ -3873,12 +4084,23 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); - put_cpu_var(perf_cpu_context); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, + vma->vm_flags & VM_EXEC); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) { + perf_event_mmap_ctx(ctx, mmap_event, + vma->vm_flags & VM_EXEC); + } +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } rcu_read_unlock(); kfree(buf); @@ -3960,8 +4182,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, struct hw_perf_event *hwc = &event->hw; int ret = 0; - throttle = (throttle && event->pmu->unthrottle != NULL); - if (!throttle) { hwc->interrupts++; } else { @@ -4004,8 +4224,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, event->pending_kill = POLL_HUP; if (nmi) { event->pending_disable = 1; - perf_pending_queue(&event->pending, - perf_pending_event); + irq_work_queue(&event->pending); } else perf_event_disable(event); } @@ -4029,6 +4248,17 @@ int perf_event_overflow(struct perf_event *event, int nmi, * Generic software event infrastructure */ +struct swevent_htable { + struct swevent_hlist *swevent_hlist; + struct mutex hlist_mutex; + int hlist_refcount; + + /* Recursion avoidance in each contexts */ + int recursion[PERF_NR_CONTEXTS]; +}; + +static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); + /* * We directly increment event->count and keep a second value in * event->hw.period_left to count intervals. This period event @@ -4086,7 +4316,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } } -static void perf_swevent_add(struct perf_event *event, u64 nr, +static void perf_swevent_event(struct perf_event *event, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { @@ -4112,6 +4342,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { + if (event->hw.state & PERF_HES_STOPPED) + return 0; + if (regs) { if (event->attr.exclude_user && user_mode(regs)) return 1; @@ -4158,11 +4391,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) /* For the read side: events when they trigger */ static inline struct hlist_head * -find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) +find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) { struct swevent_hlist *hlist; - hlist = rcu_dereference(ctx->swevent_hlist); + hlist = rcu_dereference(swhash->swevent_hlist); if (!hlist) return NULL; @@ -4171,7 +4404,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) /* For the event head insertion and removal in the hlist */ static inline struct hlist_head * -find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) +find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) { struct swevent_hlist *hlist; u32 event_id = event->attr.config; @@ -4182,7 +4415,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) * and release. Which makes the protected version suitable here. * The context lock guarantees that. */ - hlist = rcu_dereference_protected(ctx->swevent_hlist, + hlist = rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&event->ctx->lock)); if (!hlist) return NULL; @@ -4195,23 +4428,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { - struct perf_cpu_context *cpuctx; + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); struct perf_event *event; struct hlist_node *node; struct hlist_head *head; - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - - head = find_swevent_head_rcu(cpuctx, type, event_id); - + head = find_swevent_head_rcu(swhash, type, event_id); if (!head) goto end; hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) - perf_swevent_add(event, nr, nmi, data, regs); + perf_swevent_event(event, nr, nmi, data, regs); } end: rcu_read_unlock(); @@ -4219,33 +4448,17 @@ end: int perf_swevent_get_recursion_context(void) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - int rctx; - - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (cpuctx->recursion[rctx]) - return -1; - - cpuctx->recursion[rctx]++; - barrier(); + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - return rctx; + return get_recursion_context(swhash->recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void inline perf_swevent_put_recursion_context(int rctx) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - barrier(); - cpuctx->recursion[rctx]--; + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + + put_recursion_context(swhash->recursion, rctx); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, @@ -4271,20 +4484,20 @@ static void perf_swevent_read(struct perf_event *event) { } -static int perf_swevent_enable(struct perf_event *event) +static int perf_swevent_add(struct perf_event *event, int flags) { + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); struct hw_perf_event *hwc = &event->hw; - struct perf_cpu_context *cpuctx; struct hlist_head *head; - cpuctx = &__get_cpu_var(perf_cpu_context); - if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } - head = find_swevent_head(cpuctx, event); + hwc->state = !(flags & PERF_EF_START); + + head = find_swevent_head(swhash, event); if (WARN_ON_ONCE(!head)) return -EINVAL; @@ -4293,202 +4506,27 @@ static int perf_swevent_enable(struct perf_event *event) return 0; } -static void perf_swevent_disable(struct perf_event *event) +static void perf_swevent_del(struct perf_event *event, int flags) { hlist_del_rcu(&event->hlist_entry); } -static void perf_swevent_void(struct perf_event *event) +static void perf_swevent_start(struct perf_event *event, int flags) { + event->hw.state = 0; } -static int perf_swevent_int(struct perf_event *event) +static void perf_swevent_stop(struct perf_event *event, int flags) { - return 0; + event->hw.state = PERF_HES_STOPPED; } -static const struct pmu perf_ops_generic = { - .enable = perf_swevent_enable, - .disable = perf_swevent_disable, - .start = perf_swevent_int, - .stop = perf_swevent_void, - .read = perf_swevent_read, - .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ -}; - -/* - * hrtimer based swevent callback - */ - -static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct pt_regs *regs; - struct perf_event *event; - u64 period; - - event = container_of(hrtimer, struct perf_event, hw.hrtimer); - event->pmu->read(event); - - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; - regs = get_irq_regs(); - - if (regs && !perf_exclude_event(event, regs)) { - if (!(event->attr.exclude_idle && current->pid == 0)) - if (perf_event_overflow(event, 0, &data, regs)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, event->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -static void perf_swevent_start_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period; - - if (hwc->remaining) { - if (hwc->remaining < 0) - period = 10000; - else - period = hwc->remaining; - hwc->remaining = 0; - } else { - period = max_t(u64, 10000, hwc->sample_period); - } - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } -} - -static void perf_swevent_cancel_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->sample_period) { - ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); - hwc->remaining = ktime_to_ns(remaining); - - hrtimer_cancel(&hwc->hrtimer); - } -} - -/* - * Software event: cpu wall time clock - */ - -static void cpu_clock_perf_event_update(struct perf_event *event) -{ - int cpu = raw_smp_processor_id(); - s64 prev; - u64 now; - - now = cpu_clock(cpu); - prev = local64_xchg(&event->hw.prev_count, now); - local64_add(now - prev, &event->count); -} - -static int cpu_clock_perf_event_enable(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int cpu = raw_smp_processor_id(); - - local64_set(&hwc->prev_count, cpu_clock(cpu)); - perf_swevent_start_hrtimer(event); - - return 0; -} - -static void cpu_clock_perf_event_disable(struct perf_event *event) -{ - perf_swevent_cancel_hrtimer(event); - cpu_clock_perf_event_update(event); -} - -static void cpu_clock_perf_event_read(struct perf_event *event) -{ - cpu_clock_perf_event_update(event); -} - -static const struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_event_enable, - .disable = cpu_clock_perf_event_disable, - .read = cpu_clock_perf_event_read, -}; - -/* - * Software event: task time clock - */ - -static void task_clock_perf_event_update(struct perf_event *event, u64 now) -{ - u64 prev; - s64 delta; - - prev = local64_xchg(&event->hw.prev_count, now); - delta = now - prev; - local64_add(delta, &event->count); -} - -static int task_clock_perf_event_enable(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 now; - - now = event->ctx->time; - - local64_set(&hwc->prev_count, now); - - perf_swevent_start_hrtimer(event); - - return 0; -} - -static void task_clock_perf_event_disable(struct perf_event *event) -{ - perf_swevent_cancel_hrtimer(event); - task_clock_perf_event_update(event, event->ctx->time); - -} - -static void task_clock_perf_event_read(struct perf_event *event) -{ - u64 time; - - if (!in_nmi()) { - update_context_time(event->ctx); - time = event->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - time = event->ctx->time + delta; - } - - task_clock_perf_event_update(event, time); -} - -static const struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_event_enable, - .disable = task_clock_perf_event_disable, - .read = task_clock_perf_event_read, -}; - /* Deref the hlist from the update side */ static inline struct swevent_hlist * -swevent_hlist_deref(struct perf_cpu_context *cpuctx) +swevent_hlist_deref(struct swevent_htable *swhash) { - return rcu_dereference_protected(cpuctx->swevent_hlist, - lockdep_is_held(&cpuctx->hlist_mutex)); + return rcu_dereference_protected(swhash->swevent_hlist, + lockdep_is_held(&swhash->hlist_mutex)); } static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) @@ -4499,27 +4537,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) kfree(hlist); } -static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +static void swevent_hlist_release(struct swevent_htable *swhash) { - struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); + struct swevent_hlist *hlist = swevent_hlist_deref(swhash); if (!hlist) return; - rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + rcu_assign_pointer(swhash->swevent_hlist, NULL); call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); } static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - mutex_lock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); - if (!--cpuctx->hlist_refcount) - swevent_hlist_release(cpuctx); + if (!--swhash->hlist_refcount) + swevent_hlist_release(swhash); - mutex_unlock(&cpuctx->hlist_mutex); + mutex_unlock(&swhash->hlist_mutex); } static void swevent_hlist_put(struct perf_event *event) @@ -4537,12 +4575,12 @@ static void swevent_hlist_put(struct perf_event *event) static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; - mutex_lock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { + if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); @@ -4550,11 +4588,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) err = -ENOMEM; goto exit; } - rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); } - cpuctx->hlist_refcount++; - exit: - mutex_unlock(&cpuctx->hlist_mutex); + swhash->hlist_refcount++; +exit: + mutex_unlock(&swhash->hlist_mutex); return err; } @@ -4578,7 +4616,7 @@ static int swevent_hlist_get(struct perf_event *event) put_online_cpus(); return 0; - fail: +fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; @@ -4589,17 +4627,64 @@ static int swevent_hlist_get(struct perf_event *event) return err; } -#ifdef CONFIG_EVENT_TRACING +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + jump_label_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); +} + +static int perf_swevent_init(struct perf_event *event) +{ + int event_id = event->attr.config; + + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: + return -ENOENT; + + default: + break; + } + + if (event_id > PERF_COUNT_SW_MAX) + return -ENOENT; + + if (!event->parent) { + int err; -static const struct pmu perf_ops_tracepoint = { - .enable = perf_trace_enable, - .disable = perf_trace_disable, - .start = perf_swevent_int, - .stop = perf_swevent_void, + err = swevent_hlist_get(event); + if (err) + return err; + + jump_label_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + + return 0; +} + +static struct pmu perf_swevent = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_swevent_init, + .add = perf_swevent_add, + .del = perf_swevent_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, .read = perf_swevent_read, - .unthrottle = perf_swevent_void, }; +#ifdef CONFIG_EVENT_TRACING + static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { @@ -4643,7 +4728,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) - perf_swevent_add(event, count, 1, &data, regs); + perf_swevent_event(event, count, 1, &data, regs); } perf_swevent_put_recursion_context(rctx); @@ -4655,10 +4740,13 @@ static void tp_perf_event_destroy(struct perf_event *event) perf_trace_destroy(event); } -static const struct pmu *tp_perf_event_init(struct perf_event *event) +static int perf_tp_event_init(struct perf_event *event) { int err; + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. @@ -4666,15 +4754,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) if ((event->attr.sample_type & PERF_SAMPLE_RAW) && perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); + return -EPERM; err = perf_trace_init(event); if (err) - return NULL; + return err; event->destroy = tp_perf_event_destroy; - return &perf_ops_tracepoint; + return 0; +} + +static struct pmu perf_tracepoint = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_tp_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + +static inline void perf_tp_register(void) +{ + perf_pmu_register(&perf_tracepoint); } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4702,9 +4806,8 @@ static void perf_event_free_filter(struct perf_event *event) #else -static const struct pmu *tp_perf_event_init(struct perf_event *event) +static inline void perf_tp_register(void) { - return NULL; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4719,105 +4822,389 @@ static void perf_event_free_filter(struct perf_event *event) #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT -static void bp_perf_event_destroy(struct perf_event *event) +void perf_bp_event(struct perf_event *bp, void *data) { - release_bp_slot(event); + struct perf_sample_data sample; + struct pt_regs *regs = data; + + perf_sample_data_init(&sample, bp->attr.bp_addr); + + if (!bp->hw.state && !perf_exclude_event(bp, regs)) + perf_swevent_event(bp, 1, 1, &sample, regs); } +#endif -static const struct pmu *bp_perf_event_init(struct perf_event *bp) +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { - int err; + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; - err = register_perf_hw_breakpoint(bp); - if (err) - return ERR_PTR(err); + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); + + perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + regs = get_irq_regs(); + + if (regs && !perf_exclude_event(event, regs)) { + if (!(event->attr.exclude_idle && current->pid == 0)) + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } - bp->destroy = bp_perf_event_destroy; + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - return &perf_ops_bp; + return ret; } -void perf_bp_event(struct perf_event *bp, void *data) +static void perf_swevent_start_hrtimer(struct perf_event *event) { - struct perf_sample_data sample; - struct pt_regs *regs = data; + struct hw_perf_event *hwc = &event->hw; - perf_sample_data_init(&sample, bp->attr.bp_addr); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + s64 period = local64_read(&hwc->period_left); + + if (period) { + if (period < 0) + period = 10000; + + local64_set(&hwc->period_left, 0); + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL_PINNED, 0); + } +} + +static void perf_swevent_cancel_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; - if (!perf_exclude_event(bp, regs)) - perf_swevent_add(bp, 1, 1, &sample, regs); + if (hwc->sample_period) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + local64_set(&hwc->period_left, ktime_to_ns(remaining)); + + hrtimer_cancel(&hwc->hrtimer); + } } -#else -static const struct pmu *bp_perf_event_init(struct perf_event *bp) + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_event_update(struct perf_event *event) { - return NULL; + s64 prev; + u64 now; + + now = local_clock(); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); } -void perf_bp_event(struct perf_event *bp, void *regs) +static void cpu_clock_event_start(struct perf_event *event, int flags) { + local64_set(&event->hw.prev_count, local_clock()); + perf_swevent_start_hrtimer(event); } -#endif -atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; +static void cpu_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + cpu_clock_event_update(event); +} -static void sw_perf_event_destroy(struct perf_event *event) +static int cpu_clock_event_add(struct perf_event *event, int flags) { - u64 event_id = event->attr.config; + if (flags & PERF_EF_START) + cpu_clock_event_start(event, flags); - WARN_ON(event->parent); + return 0; +} - atomic_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); +static void cpu_clock_event_del(struct perf_event *event, int flags) +{ + cpu_clock_event_stop(event, flags); } -static const struct pmu *sw_perf_event_init(struct perf_event *event) +static void cpu_clock_event_read(struct perf_event *event) { - const struct pmu *pmu = NULL; - u64 event_id = event->attr.config; + cpu_clock_event_update(event); +} + +static int cpu_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) + return -ENOENT; + + return 0; +} + +static struct pmu perf_cpu_clock = { + .task_ctx_nr = perf_sw_context, + + .event_init = cpu_clock_event_init, + .add = cpu_clock_event_add, + .del = cpu_clock_event_del, + .start = cpu_clock_event_start, + .stop = cpu_clock_event_stop, + .read = cpu_clock_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = local64_xchg(&event->hw.prev_count, now); + delta = now - prev; + local64_add(delta, &event->count); +} + +static void task_clock_event_start(struct perf_event *event, int flags) +{ + local64_set(&event->hw.prev_count, event->ctx->time); + perf_swevent_start_hrtimer(event); +} + +static void task_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + task_clock_event_update(event, event->ctx->time); +} + +static int task_clock_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + task_clock_event_start(event, flags); + + return 0; +} + +static void task_clock_event_del(struct perf_event *event, int flags) +{ + task_clock_event_stop(event, PERF_EF_UPDATE); +} + +static void task_clock_event_read(struct perf_event *event) +{ + u64 time; + + if (!in_nmi()) { + update_context_time(event->ctx); + time = event->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; + } + + task_clock_event_update(event, time); +} + +static int task_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) + return -ENOENT; + + return 0; +} + +static struct pmu perf_task_clock = { + .task_ctx_nr = perf_sw_context, + + .event_init = task_clock_event_init, + .add = task_clock_event_add, + .del = task_clock_event_del, + .start = task_clock_event_start, + .stop = task_clock_event_stop, + .read = task_clock_event_read, +}; + +static void perf_pmu_nop_void(struct pmu *pmu) +{ +} + +static int perf_pmu_nop_int(struct pmu *pmu) +{ + return 0; +} + +static void perf_pmu_start_txn(struct pmu *pmu) +{ + perf_pmu_disable(pmu); +} + +static int perf_pmu_commit_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); + return 0; +} + +static void perf_pmu_cancel_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); +} + +/* + * Ensures all contexts with the same task_ctx_nr have the same + * pmu_cpu_context too. + */ +static void *find_pmu_context(int ctxn) +{ + struct pmu *pmu; + + if (ctxn < 0) + return NULL; + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->task_ctx_nr == ctxn) + return pmu->pmu_cpu_context; + } + + return NULL; +} + +static void free_pmu_context(void * __percpu cpu_context) +{ + struct pmu *pmu; + + mutex_lock(&pmus_lock); /* - * Software events (currently) can't in general distinguish - * between user, kernel and hypervisor events. - * However, context switches and cpu migrations are considered - * to be kernel events, and page faults are never hypervisor - * events. + * Like a real lame refcount. */ - switch (event_id) { - case PERF_COUNT_SW_CPU_CLOCK: - pmu = &perf_ops_cpu_clock; + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->pmu_cpu_context == cpu_context) + goto out; + } - break; - case PERF_COUNT_SW_TASK_CLOCK: - /* - * If the user instantiates this as a per-cpu event, - * use the cpu_clock event instead. - */ - if (event->ctx->task) - pmu = &perf_ops_task_clock; - else - pmu = &perf_ops_cpu_clock; + free_percpu(cpu_context); +out: + mutex_unlock(&pmus_lock); +} - break; - case PERF_COUNT_SW_PAGE_FAULTS: - case PERF_COUNT_SW_PAGE_FAULTS_MIN: - case PERF_COUNT_SW_PAGE_FAULTS_MAJ: - case PERF_COUNT_SW_CONTEXT_SWITCHES: - case PERF_COUNT_SW_CPU_MIGRATIONS: - case PERF_COUNT_SW_ALIGNMENT_FAULTS: - case PERF_COUNT_SW_EMULATION_FAULTS: - if (!event->parent) { - int err; - - err = swevent_hlist_get(event); - if (err) - return ERR_PTR(err); +int perf_pmu_register(struct pmu *pmu) +{ + int cpu, ret; - atomic_inc(&perf_swevent_enabled[event_id]); - event->destroy = sw_perf_event_destroy; + mutex_lock(&pmus_lock); + ret = -ENOMEM; + pmu->pmu_disable_count = alloc_percpu(int); + if (!pmu->pmu_disable_count) + goto unlock; + + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); + if (pmu->pmu_cpu_context) + goto got_cpu_context; + + pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); + if (!pmu->pmu_cpu_context) + goto free_pdc; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx); + cpuctx->ctx.type = cpu_context; + cpuctx->ctx.pmu = pmu; + cpuctx->jiffies_interval = 1; + INIT_LIST_HEAD(&cpuctx->rotation_list); + } + +got_cpu_context: + if (!pmu->start_txn) { + if (pmu->pmu_enable) { + /* + * If we have pmu_enable/pmu_disable calls, install + * transaction stubs that use that to try and batch + * hardware accesses. + */ + pmu->start_txn = perf_pmu_start_txn; + pmu->commit_txn = perf_pmu_commit_txn; + pmu->cancel_txn = perf_pmu_cancel_txn; + } else { + pmu->start_txn = perf_pmu_nop_void; + pmu->commit_txn = perf_pmu_nop_int; + pmu->cancel_txn = perf_pmu_nop_void; + } + } + + if (!pmu->pmu_enable) { + pmu->pmu_enable = perf_pmu_nop_void; + pmu->pmu_disable = perf_pmu_nop_void; + } + + list_add_rcu(&pmu->entry, &pmus); + ret = 0; +unlock: + mutex_unlock(&pmus_lock); + + return ret; + +free_pdc: + free_percpu(pmu->pmu_disable_count); + goto unlock; +} + +void perf_pmu_unregister(struct pmu *pmu) +{ + mutex_lock(&pmus_lock); + list_del_rcu(&pmu->entry); + mutex_unlock(&pmus_lock); + + /* + * We dereference the pmu list under both SRCU and regular RCU, so + * synchronize against both of those. + */ + synchronize_srcu(&pmus_srcu); + synchronize_rcu(); + + free_percpu(pmu->pmu_disable_count); + free_pmu_context(pmu->pmu_cpu_context); +} + +struct pmu *perf_init_event(struct perf_event *event) +{ + struct pmu *pmu = NULL; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + int ret = pmu->event_init(event); + if (!ret) + goto unlock; + + if (ret != -ENOENT) { + pmu = ERR_PTR(ret); + goto unlock; } - pmu = &perf_ops_generic; - break; } + pmu = ERR_PTR(-ENOENT); +unlock: + srcu_read_unlock(&pmus_srcu, idx); return pmu; } @@ -4826,20 +5213,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) * Allocate and initialize a event structure */ static struct perf_event * -perf_event_alloc(struct perf_event_attr *attr, - int cpu, - struct perf_event_context *ctx, - struct perf_event *group_leader, - struct perf_event *parent_event, - perf_overflow_handler_t overflow_handler, - gfp_t gfpflags) -{ - const struct pmu *pmu; +perf_event_alloc(struct perf_event_attr *attr, int cpu, + struct task_struct *task, + struct perf_event *group_leader, + struct perf_event *parent_event, + perf_overflow_handler_t overflow_handler) +{ + struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; long err; - event = kzalloc(sizeof(*event), gfpflags); + event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return ERR_PTR(-ENOMEM); @@ -4857,6 +5242,7 @@ perf_event_alloc(struct perf_event_attr *attr, INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); init_waitqueue_head(&event->waitq); + init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); @@ -4864,7 +5250,6 @@ perf_event_alloc(struct perf_event_attr *attr, event->attr = *attr; event->group_leader = group_leader; event->pmu = NULL; - event->ctx = ctx; event->oncpu = -1; event->parent = parent_event; @@ -4874,6 +5259,17 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; + if (task) { + event->attach_state = PERF_ATTACH_TASK; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + /* + * hw_breakpoint is a bit difficult here.. + */ + if (attr->type == PERF_TYPE_BREAKPOINT) + event->hw.bp_target = task; +#endif + } + if (!overflow_handler && parent_event) overflow_handler = parent_event->overflow_handler; @@ -4898,29 +5294,8 @@ perf_event_alloc(struct perf_event_attr *attr, if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; - switch (attr->type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - pmu = hw_perf_event_init(event); - break; - - case PERF_TYPE_SOFTWARE: - pmu = sw_perf_event_init(event); - break; - - case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_event_init(event); - break; - - case PERF_TYPE_BREAKPOINT: - pmu = bp_perf_event_init(event); - break; - + pmu = perf_init_event(event); - default: - break; - } done: err = 0; if (!pmu) @@ -4938,13 +5313,21 @@ done: event->pmu = pmu; if (!event->parent) { - atomic_inc(&nr_events); + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_inc(&perf_task_events); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { + err = get_callchain_buffers(); + if (err) { + free_event(event); + return ERR_PTR(err); + } + } } return event; @@ -5092,12 +5475,16 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { - struct perf_event *event, *group_leader = NULL, *output_event = NULL; + struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event *event, *sibling; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct file *group_file = NULL; + struct task_struct *task = NULL; + struct pmu *pmu; int event_fd; + int move_group = 0; int fput_needed = 0; int err; @@ -5123,20 +5510,11 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_fd; - } - if (group_fd != -1) { group_leader = perf_fget_light(group_fd, &fput_needed); if (IS_ERR(group_leader)) { err = PTR_ERR(group_leader); - goto err_put_context; + goto err_fd; } group_file = group_leader->filp; if (flags & PERF_FLAG_FD_OUTPUT) @@ -5145,6 +5523,58 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } + if (pid != -1) { + task = find_lively_task_by_vpid(pid); + if (IS_ERR(task)) { + err = PTR_ERR(task); + goto err_group_fd; + } + } + + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_task; + } + + /* + * Special case software events and allow them to be part of + * any hardware group. + */ + pmu = event->pmu; + + if (group_leader && + (is_software_event(event) != is_software_event(group_leader))) { + if (is_software_event(event)) { + /* + * If event and group_leader are not both a software + * event, and event is, then group leader is not. + * + * Allow the addition of software events to !software + * groups, this is safe because software events never + * fail to schedule. + */ + pmu = group_leader->pmu; + } else if (is_software_event(group_leader) && + (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. + */ + move_group = 1; + } + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_alloc; + } + /* * Look up the group leader (we will attach this event to it): */ @@ -5156,42 +5586,66 @@ SYSCALL_DEFINE5(perf_event_open, * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) - goto err_put_context; + goto err_context; /* * Do not allow to attach to a group in a different * task or CPU context: */ - if (group_leader->ctx != ctx) - goto err_put_context; + if (move_group) { + if (group_leader->ctx->type != ctx->type) + goto err_context; + } else { + if (group_leader->ctx != ctx) + goto err_context; + } + /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) - goto err_put_context; - } - - event = perf_event_alloc(&attr, cpu, ctx, group_leader, - NULL, NULL, GFP_KERNEL); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err_put_context; + goto err_context; } if (output_event) { err = perf_event_set_output(event, output_event); if (err) - goto err_free_put_context; + goto err_context; } event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); - goto err_free_put_context; + goto err_context; + } + + if (move_group) { + struct perf_event_context *gctx = group_leader->ctx; + + mutex_lock(&gctx->mutex); + perf_event_remove_from_context(group_leader); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_event_remove_from_context(sibling); + put_ctx(gctx); + } + mutex_unlock(&gctx->mutex); + put_ctx(gctx); } event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + + if (move_group) { + perf_install_in_context(ctx, group_leader, cpu); + get_ctx(ctx); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_install_in_context(ctx, sibling, cpu); + get_ctx(ctx); + } + } + perf_install_in_context(ctx, event, cpu); ++ctx->generation; mutex_unlock(&ctx->mutex); @@ -5212,11 +5666,15 @@ SYSCALL_DEFINE5(perf_event_open, fd_install(event_fd, event_file); return event_fd; -err_free_put_context: +err_context: + put_ctx(ctx); +err_alloc: free_event(event); -err_put_context: +err_task: + if (task) + put_task_struct(task); +err_group_fd: fput_light(group_file, fput_needed); - put_ctx(ctx); err_fd: put_unused_fd(event_fd); return err; @@ -5227,32 +5685,31 @@ err_fd: * * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound - * @pid: task to profile + * @task: task to profile (NULL for percpu) */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - pid_t pid, + struct task_struct *task, perf_overflow_handler_t overflow_handler) { - struct perf_event *event; struct perf_event_context *ctx; + struct perf_event *event; int err; /* * Get the target context (task or percpu): */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_exit; - } - - event = perf_event_alloc(attr, cpu, ctx, NULL, - NULL, overflow_handler, GFP_KERNEL); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_put_context; + goto err; + } + + ctx = find_get_context(event->pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_free; } event->filp = NULL; @@ -5270,112 +5727,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; - err_put_context: - put_ctx(ctx); - err_exit: +err_free: + free_event(event); +err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); -/* - * inherit a event from parent task to child task: - */ -static struct perf_event * -inherit_event(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event *group_leader, - struct perf_event_context *child_ctx) -{ - struct perf_event *child_event; - - /* - * Instead of creating recursive hierarchies of events, - * we link inherited events back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_event->parent) - parent_event = parent_event->parent; - - child_event = perf_event_alloc(&parent_event->attr, - parent_event->cpu, child_ctx, - group_leader, parent_event, - NULL, GFP_KERNEL); - if (IS_ERR(child_event)) - return child_event; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent event, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_event_{en, dis}able_family. - */ - if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) - child_event->state = PERF_EVENT_STATE_INACTIVE; - else - child_event->state = PERF_EVENT_STATE_OFF; - - if (parent_event->attr.freq) { - u64 sample_period = parent_event->hw.sample_period; - struct hw_perf_event *hwc = &child_event->hw; - - hwc->sample_period = sample_period; - hwc->last_period = sample_period; - - local64_set(&hwc->period_left, sample_period); - } - - child_event->overflow_handler = parent_event->overflow_handler; - - /* - * Link it up in the child's context: - */ - add_event_to_ctx(child_event, child_ctx); - - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - - /* - * Link this into the parent event's child list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_add_tail(&child_event->child_list, &parent_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - return child_event; -} - -static int inherit_group(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event_context *child_ctx) -{ - struct perf_event *leader; - struct perf_event *sub; - struct perf_event *child_ctr; - - leader = inherit_event(parent_event, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { - child_ctr = inherit_event(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { @@ -5432,16 +5790,13 @@ __perf_event_exit_task(struct perf_event *child_event, } } -/* - * When a child task exits, feed back event values to parent events. - */ -void perf_event_exit_task(struct task_struct *child) +static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event *child_event, *tmp; struct perf_event_context *child_ctx; unsigned long flags; - if (likely(!child->perf_event_ctxp)) { + if (likely(!child->perf_event_ctxp[ctxn])) { perf_event_task(child, NULL, 0); return; } @@ -5453,8 +5808,8 @@ void perf_event_exit_task(struct task_struct *child) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = child->perf_event_ctxp; - __perf_event_task_sched_out(child_ctx); + child_ctx = child->perf_event_ctxp[ctxn]; + task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Take the context lock here so that if find_get_context is @@ -5462,7 +5817,7 @@ void perf_event_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ raw_spin_lock(&child_ctx->lock); - child->perf_event_ctxp = NULL; + child->perf_event_ctxp[ctxn] = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -5515,6 +5870,17 @@ again: put_ctx(child_ctx); } +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + perf_event_exit_task_context(child, ctxn); +} + static void perf_free_event(struct perf_event *event, struct perf_event_context *ctx) { @@ -5536,48 +5902,166 @@ static void perf_free_event(struct perf_event *event, /* * free an unexposed, unused context as created by inheritance by - * init_task below, used by fork() in case of fail. + * perf_event_init_task below, used by fork() in case of fail. */ void perf_event_free_task(struct task_struct *task) { - struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *ctx; struct perf_event *event, *tmp; + int ctxn; - if (!ctx) - return; + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; - mutex_lock(&ctx->mutex); + mutex_lock(&ctx->mutex); again: - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - perf_free_event(event, ctx); + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, + group_entry) + perf_free_event(event, ctx); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, - group_entry) - perf_free_event(event, ctx); + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); - if (!list_empty(&ctx->pinned_groups) || - !list_empty(&ctx->flexible_groups)) - goto again; + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; - mutex_unlock(&ctx->mutex); + mutex_unlock(&ctx->mutex); - put_ctx(ctx); + put_ctx(ctx); + } +} + +void perf_event_delayed_put(struct task_struct *task) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + unsigned long flags; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, + child, + group_leader, parent_event, + NULL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. + */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) { + u64 sample_period = parent_event->hw.sample_period; + struct hw_perf_event *hwc = &child_event->hw; + + hwc->sample_period = sample_period; + hwc->last_period = sample_period; + + local64_set(&hwc->period_left, sample_period); + } + + child_event->ctx = child_ctx; + child_event->overflow_handler = parent_event->overflow_handler; + + /* + * Link it up in the child's context: + */ + raw_spin_lock_irqsave(&child_ctx->lock, flags); + add_event_to_ctx(child_event, child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; } static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, - struct task_struct *child, + struct task_struct *child, int ctxn, int *inherited_all) { int ret; - struct perf_event_context *child_ctx = child->perf_event_ctxp; + struct perf_event_context *child_ctx; if (!event->attr.inherit) { *inherited_all = 0; return 0; } + child_ctx = child->perf_event_ctxp[ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -5586,14 +6070,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * child. */ - child_ctx = kzalloc(sizeof(struct perf_event_context), - GFP_KERNEL); + child_ctx = alloc_perf_context(event->pmu, child); if (!child_ctx) return -ENOMEM; - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); + child->perf_event_ctxp[ctxn] = child_ctx; } ret = inherit_group(event, parent, parent_ctx, @@ -5605,11 +6086,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return ret; } - /* * Initialize the perf_event context in task_struct */ -int perf_event_init_task(struct task_struct *child) +int perf_event_init_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -5618,19 +6098,19 @@ int perf_event_init_task(struct task_struct *child) int inherited_all = 1; int ret = 0; - child->perf_event_ctxp = NULL; + child->perf_event_ctxp[ctxn] = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); - if (likely(!parent->perf_event_ctxp)) + if (likely(!parent->perf_event_ctxp[ctxn])) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ - parent_ctx = perf_pin_task_context(parent); + parent_ctx = perf_pin_task_context(parent, ctxn); /* * No need to check if parent_ctx != NULL here; since we saw @@ -5650,20 +6130,20 @@ int perf_event_init_task(struct task_struct *child) * the list, not manipulating it: */ list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, child, - &inherited_all); + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); if (ret) break; } list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, child, - &inherited_all); + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); if (ret) break; } - child_ctx = child->perf_event_ctxp; + child_ctx = child->perf_event_ctxp[ctxn]; if (child_ctx && inherited_all) { /* @@ -5692,63 +6172,98 @@ int perf_event_init_task(struct task_struct *child) return ret; } +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + int ctxn, ret; + + for_each_task_context_nr(ctxn) { + ret = perf_event_init_context(child, ctxn); + if (ret) + return ret; + } + + return 0; +} + static void __init perf_event_init_all_cpus(void) { + struct swevent_htable *swhash; int cpu; - struct perf_cpu_context *cpuctx; for_each_possible_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - mutex_init(&cpuctx->hlist_mutex); - __perf_event_init_context(&cpuctx->ctx, NULL); + swhash = &per_cpu(swevent_htable, cpu); + mutex_init(&swhash->hlist_mutex); + INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); } } static void __cpuinit perf_event_init_cpu(int cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = &per_cpu(perf_cpu_context, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; - spin_unlock(&perf_resource_lock); - - mutex_lock(&cpuctx->hlist_mutex); - if (cpuctx->hlist_refcount > 0) { + mutex_lock(&swhash->hlist_mutex); + if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; - hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); - WARN_ON_ONCE(!hlist); - rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); + WARN_ON(!hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); } - mutex_unlock(&cpuctx->hlist_mutex); + mutex_unlock(&swhash->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU -static void __perf_event_exit_cpu(void *info) +static void perf_pmu_rotate_stop(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + WARN_ON(!irqs_disabled()); + + list_del_init(&cpuctx->rotation_list); +} + +static void __perf_event_exit_context(void *__info) +{ + struct perf_event_context *ctx = __info; struct perf_event *event, *tmp; + perf_pmu_rotate_stop(ctx->pmu); + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) __perf_event_remove_from_context(event); list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) __perf_event_remove_from_context(event); } + +static void perf_event_exit_cpu_context(int cpu) +{ + struct perf_event_context *ctx; + struct pmu *pmu; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + mutex_unlock(&ctx->mutex); + } + srcu_read_unlock(&pmus_srcu, idx); +} + static void perf_event_exit_cpu(int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_event_context *ctx = &cpuctx->ctx; + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - mutex_lock(&cpuctx->hlist_mutex); - swevent_hlist_release(cpuctx); - mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); + swevent_hlist_release(swhash); + mutex_unlock(&swhash->hlist_mutex); - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); - mutex_unlock(&ctx->mutex); + perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { } @@ -5778,118 +6293,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) return NOTIFY_OK; } -/* - * This has to have a higher priority than migration_notifier in sched.c. - */ -static struct notifier_block __cpuinitdata perf_cpu_nb = { - .notifier_call = perf_cpu_notify, - .priority = 20, -}; - void __init perf_event_init(void) { perf_event_init_all_cpus(); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&perf_cpu_nb); -} - -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", perf_reserved_percpu); -} - -static ssize_t -perf_set_reserve_percpu(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - const char *buf, - size_t count) -{ - struct perf_cpu_context *cpuctx; - unsigned long val; - int err, cpu, mpt; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > perf_max_events) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_reserved_percpu = val; - for_each_online_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - raw_spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_events - cpuctx->ctx.nr_events, - perf_max_events - perf_reserved_percpu); - cpuctx->max_pertask = mpt; - raw_spin_unlock_irq(&cpuctx->ctx.lock); - } - spin_unlock(&perf_resource_lock); - - return count; -} - -static ssize_t perf_show_overcommit(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", perf_overcommit); -} - -static ssize_t -perf_set_overcommit(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > 1) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_overcommit = val; - spin_unlock(&perf_resource_lock); - - return count; -} - -static SYSDEV_CLASS_ATTR( - reserve_percpu, - 0644, - perf_show_reserve_percpu, - perf_set_reserve_percpu - ); - -static SYSDEV_CLASS_ATTR( - overcommit, - 0644, - perf_show_overcommit, - perf_set_overcommit - ); - -static struct attribute *perfclass_attrs[] = { - &attr_reserve_percpu.attr, - &attr_overcommit.attr, - NULL -}; - -static struct attribute_group perfclass_attr_group = { - .attrs = perfclass_attrs, - .name = "perf_events", -}; - -static int __init perf_event_sysfs_init(void) -{ - return sysfs_create_group(&cpu_sysdev_class.kset.kobj, - &perfclass_attr_group); + init_srcu_struct(&pmus_srcu); + perf_pmu_register(&perf_swevent); + perf_pmu_register(&perf_cpu_clock); + perf_pmu_register(&perf_task_clock); + perf_tp_register(); + perf_cpu_notifier(perf_cpu_notify); } -device_initcall(perf_event_sysfs_init); diff --git a/kernel/pid.c b/kernel/pid.c index d55c6fb8d087..39b65b69584f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference_check(pid->tasks[type].first, + first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); if (first) @@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { + rcu_lockdep_assert(rcu_read_lock_held()); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ca6066a6952e..29bff6117abc 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -86,6 +86,7 @@ config PM_SLEEP_SMP depends on SMP depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE depends on PM_SLEEP + select HOTPLUG select HOTPLUG_CPU default y @@ -137,6 +138,8 @@ config SUSPEND_FREEZER config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE + select LZO_COMPRESS + select LZO_DECOMPRESS select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually @@ -242,3 +245,17 @@ config PM_OPS bool depends on PM_SLEEP || PM_RUNTIME default y + +config PM_OPP + bool "Operating Performance Point (OPP) Layer library" + depends on PM + ---help--- + SOCs have a standard set of tuples consisting of frequency and + voltage pairs that the device will support per voltage domain. This + is called Operating Performance Point or OPP. The actual definitions + of OPP varies over silicon within the same family of devices. + + OPP layer organizes the data internally using device pointers + representing individual voltage domains and provides SOC + implementations a ready to use framework to manage OPPs. + For more information, read <file:Documentation/power/opp.txt> diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8dc31e02ae12..657272e91d0a 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -29,6 +29,7 @@ #include "power.h" +static int nocompress = 0; static int noresume = 0; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; @@ -638,6 +639,8 @@ int hibernate(void) if (hibernation_mode == HIBERNATION_PLATFORM) flags |= SF_PLATFORM_MODE; + if (nocompress) + flags |= SF_NOCOMPRESS_MODE; pr_debug("PM: writing image.\n"); error = swsusp_write(flags); swsusp_free(); @@ -705,7 +708,7 @@ static int software_resume(void) goto Unlock; } - pr_debug("PM: Checking image partition %s\n", resume_file); + pr_debug("PM: Checking hibernation image partition %s\n", resume_file); /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); @@ -730,10 +733,10 @@ static int software_resume(void) } Check_image: - pr_debug("PM: Resume from partition %d:%d\n", + pr_debug("PM: Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - pr_debug("PM: Checking hibernation image.\n"); + pr_debug("PM: Looking for hibernation image.\n"); error = swsusp_check(); if (error) goto Unlock; @@ -765,14 +768,14 @@ static int software_resume(void) goto Done; } - pr_debug("PM: Reading hibernation image.\n"); + pr_debug("PM: Loading hibernation image.\n"); error = swsusp_read(&flags); swsusp_close(FMODE_READ); if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); - printk(KERN_ERR "PM: Restore failed, recovering.\n"); + printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); swsusp_free(); thaw_processes(); Done: @@ -785,7 +788,7 @@ static int software_resume(void) /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&pm_mutex); - pr_debug("PM: Resume from disk failed.\n"); + pr_debug("PM: Hibernation image not present or could not be loaded.\n"); return error; close_finish: swsusp_close(FMODE_READ); @@ -1004,6 +1007,15 @@ static int __init resume_offset_setup(char *str) return 1; } +static int __init hibernate_setup(char *str) +{ + if (!strncmp(str, "noresume", 8)) + noresume = 1; + else if (!strncmp(str, "nocompress", 10)) + nocompress = 1; + return 1; +} + static int __init noresume_setup(char *str) { noresume = 1; @@ -1013,3 +1025,4 @@ static int __init noresume_setup(char *str) __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); +__setup("hibernate=", hibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 62b0bc6e4983..7b5db6a8561e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -237,18 +237,18 @@ static ssize_t wakeup_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - unsigned long val; + unsigned int val; - return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; + return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { - unsigned long val; + unsigned int val; - if (sscanf(buf, "%lu", &val) == 1) { + if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) return n; } @@ -281,12 +281,30 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, } power_attr(pm_trace); + +static ssize_t pm_trace_dev_match_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return show_trace_dev_match(buf, PAGE_SIZE); +} + +static ssize_t +pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + return -EINVAL; +} + +power_attr(pm_trace_dev_match); + #endif /* CONFIG_PM_TRACE */ static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, + &pm_trace_dev_match_attr.attr, #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, @@ -308,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { - pm_wq = create_freezeable_workqueue("pm"); + pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); return pm_wq ? 0 : -ENOMEM; } @@ -321,6 +339,7 @@ static int __init pm_init(void) int error = pm_start_workqueue(); if (error) return error; + hibernate_image_size_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; diff --git a/kernel/power/power.h b/kernel/power/power.h index 006270fe382d..03634be55f62 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -14,6 +14,9 @@ struct swsusp_info { } __attribute__((aligned(PAGE_SIZE))); #ifdef CONFIG_HIBERNATION +/* kernel/power/snapshot.c */ +extern void __init hibernate_image_size_init(void); + #ifdef CONFIG_ARCH_HIBERNATION_HEADER /* Maximum size of architecture specific data in a hibernation header */ #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) @@ -49,7 +52,11 @@ static inline char *check_image_kernel(struct swsusp_info *info) extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); -#endif + +#else /* !CONFIG_HIBERNATION */ + +static inline void hibernate_image_size_init(void) {} +#endif /* !CONFIG_HIBERNATION */ extern int pfn_is_nosave(unsigned long); @@ -134,6 +141,7 @@ extern int swsusp_swap_in_use(void); * the image header. */ #define SF_PLATFORM_MODE 1 +#define SF_NOCOMPRESS_MODE 2 /* kernel/power/hibernate.c */ extern int swsusp_check(void); diff --git a/kernel/power/process.c b/kernel/power/process.c index 028a99598f49..e50b4c1b2a0f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -40,6 +40,7 @@ static int try_to_freeze_tasks(bool sig_only) struct timeval start, end; u64 elapsed_csecs64; unsigned int elapsed_csecs; + bool wakeup = false; do_gettimeofday(&start); @@ -78,6 +79,11 @@ static int try_to_freeze_tasks(bool sig_only) if (!todo || time_after(jiffies, end_time)) break; + if (!pm_check_wakeup_events()) { + wakeup = true; + break; + } + /* * We need to retry, but first give the freezing tasks some * time to enter the regrigerator. @@ -97,8 +103,9 @@ static int try_to_freeze_tasks(bool sig_only) * but it cleans up leftover PF_FREEZE requests. */ printk("\n"); - printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " + printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", + wakeup ? "aborted" : "failed", elapsed_csecs / 100, elapsed_csecs % 100, todo - wq_busy, wq_busy); @@ -107,7 +114,7 @@ static int try_to_freeze_tasks(bool sig_only) read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); - if (freezing(p) && !freezer_should_skip(p)) + if (!wakeup && freezing(p) && !freezer_should_skip(p)) sched_show_task(p); cancel_freezing(p); task_unlock(p); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d3f795f01bbc..ac7eb109f196 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -46,7 +46,12 @@ static void swsusp_unset_page_forbidden(struct page *); * size will not exceed N bytes, but if that is impossible, it will * try to create the smallest image possible. */ -unsigned long image_size = 500 * 1024 * 1024; +unsigned long image_size; + +void __init hibernate_image_size_init(void) +{ + image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; +} /* List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been @@ -1318,12 +1323,14 @@ int hibernate_preallocate_memory(void) /* Compute the maximum number of saveable pages to leave in memory. */ max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; + /* Compute the desired number of image pages specified by image_size. */ size = DIV_ROUND_UP(image_size, PAGE_SIZE); if (size > max_size) size = max_size; /* - * If the maximum is not less than the current number of saveable pages - * in memory, allocate page frames for the image and we're done. + * If the desired number of image pages is at least as large as the + * current number of saveable pages in memory, allocate page frames for + * the image and we're done. */ if (size >= saveable) { pages = preallocate_image_highmem(save_highmem); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e6a5bdf61a37..916eaa790399 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -24,10 +24,12 @@ #include <linux/swapops.h> #include <linux/pm.h> #include <linux/slab.h> +#include <linux/lzo.h> +#include <linux/vmalloc.h> #include "power.h" -#define SWSUSP_SIG "S1SUSPEND" +#define HIBERNATE_SIG "LINHIB0001" /* * The swap map is a data structure used for keeping track of each page @@ -193,7 +195,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); - memcpy(swsusp_header->sig,SWSUSP_SIG, 10); + memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); swsusp_header->image = handle->first_sector; swsusp_header->flags = flags; error = hib_bio_write_page(swsusp_resume_block, @@ -357,6 +359,18 @@ static int swap_writer_finish(struct swap_map_handle *handle, return error; } +/* We need to remember how much compressed data we need to read. */ +#define LZO_HEADER sizeof(size_t) + +/* Number of pages/bytes we'll compress at one time. */ +#define LZO_UNC_PAGES 32 +#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) + +/* Number of pages/bytes we need for compressed data (worst case). */ +#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ + LZO_HEADER, PAGE_SIZE) +#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) + /** * save_image - save the suspend image data */ @@ -404,6 +418,137 @@ static int save_image(struct swap_map_handle *handle, return ret; } + +/** + * save_image_lzo - Save the suspend image data compressed with LZO. + * @handle: Swap mam handle to use for saving the image. + * @snapshot: Image to read data from. + * @nr_to_write: Number of pages to save. + */ +static int save_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) +{ + unsigned int m; + int ret = 0; + int nr_pages; + int err2; + struct bio *bio; + struct timeval start; + struct timeval stop; + size_t off, unc_len, cmp_len; + unsigned char *unc, *cmp, *wrk, *page; + + page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + return -ENOMEM; + } + + wrk = vmalloc(LZO1X_1_MEM_COMPRESS); + if (!wrk) { + printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); + free_page((unsigned long)page); + return -ENOMEM; + } + + unc = vmalloc(LZO_UNC_SIZE); + if (!unc) { + printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); + vfree(wrk); + free_page((unsigned long)page); + return -ENOMEM; + } + + cmp = vmalloc(LZO_CMP_SIZE); + if (!cmp) { + printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + vfree(unc); + vfree(wrk); + free_page((unsigned long)page); + return -ENOMEM; + } + + printk(KERN_INFO + "PM: Compressing and saving image data (%u pages) ... ", + nr_to_write); + m = nr_to_write / 100; + if (!m) + m = 1; + nr_pages = 0; + bio = NULL; + do_gettimeofday(&start); + for (;;) { + for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + ret = snapshot_read_next(snapshot); + if (ret < 0) + goto out_finish; + + if (!ret) + break; + + memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); + + if (!(nr_pages % m)) + printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + + if (!off) + break; + + unc_len = off; + ret = lzo1x_1_compress(unc, unc_len, + cmp + LZO_HEADER, &cmp_len, wrk); + if (ret < 0) { + printk(KERN_ERR "PM: LZO compression failed\n"); + break; + } + + if (unlikely(!cmp_len || + cmp_len > lzo1x_worst_compress(unc_len))) { + printk(KERN_ERR "PM: Invalid LZO compressed length\n"); + ret = -1; + break; + } + + *(size_t *)cmp = cmp_len; + + /* + * Given we are writing one page at a time to disk, we copy + * that much from the buffer, although the last bit will likely + * be smaller than full page. This is OK - we saved the length + * of the compressed data, so any garbage at the end will be + * discarded when we read it. + */ + for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { + memcpy(page, cmp + off, PAGE_SIZE); + + ret = swap_write_page(handle, page, &bio); + if (ret) + goto out_finish; + } + } + +out_finish: + err2 = hib_wait_on_bio_chain(&bio); + do_gettimeofday(&stop); + if (!ret) + ret = err2; + if (!ret) + printk(KERN_CONT "\b\b\b\bdone\n"); + else + printk(KERN_CONT "\n"); + swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + + vfree(cmp); + vfree(unc); + vfree(wrk); + free_page((unsigned long)page); + + return ret; +} + /** * enough_swap - Make sure we have enough swap to save the image. * @@ -411,12 +556,16 @@ static int save_image(struct swap_map_handle *handle, * space avaiable from the resume partition. */ -static int enough_swap(unsigned int nr_pages) +static int enough_swap(unsigned int nr_pages, unsigned int flags) { unsigned int free_swap = count_swap_pages(root_swap, 1); + unsigned int required; pr_debug("PM: Free swap pages: %u\n", free_swap); - return free_swap > nr_pages + PAGES_FOR_IO; + + required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? + nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); + return free_swap > required; } /** @@ -443,7 +592,7 @@ int swsusp_write(unsigned int flags) printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } - if (!enough_swap(pages)) { + if (!enough_swap(pages, flags)) { printk(KERN_ERR "PM: Not enough free swap\n"); error = -ENOSPC; goto out_finish; @@ -458,8 +607,11 @@ int swsusp_write(unsigned int flags) } header = (struct swsusp_info *)data_of(snapshot); error = swap_write_page(&handle, header, NULL); - if (!error) - error = save_image(&handle, &snapshot, pages - 1); + if (!error) { + error = (flags & SF_NOCOMPRESS_MODE) ? + save_image(&handle, &snapshot, pages - 1) : + save_image_lzo(&handle, &snapshot, pages - 1); + } out_finish: error = swap_writer_finish(&handle, flags, error); return error; @@ -590,6 +742,127 @@ static int load_image(struct swap_map_handle *handle, } /** + * load_image_lzo - Load compressed image data and decompress them with LZO. + * @handle: Swap map handle to use for loading data. + * @snapshot: Image to copy uncompressed data into. + * @nr_to_read: Number of pages to load. + */ +static int load_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) +{ + unsigned int m; + int error = 0; + struct timeval start; + struct timeval stop; + unsigned nr_pages; + size_t off, unc_len, cmp_len; + unsigned char *unc, *cmp, *page; + + page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + return -ENOMEM; + } + + unc = vmalloc(LZO_UNC_SIZE); + if (!unc) { + printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); + free_page((unsigned long)page); + return -ENOMEM; + } + + cmp = vmalloc(LZO_CMP_SIZE); + if (!cmp) { + printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + vfree(unc); + free_page((unsigned long)page); + return -ENOMEM; + } + + printk(KERN_INFO + "PM: Loading and decompressing image data (%u pages) ... ", + nr_to_read); + m = nr_to_read / 100; + if (!m) + m = 1; + nr_pages = 0; + do_gettimeofday(&start); + + error = snapshot_write_next(snapshot); + if (error <= 0) + goto out_finish; + + for (;;) { + error = swap_read_page(handle, page, NULL); /* sync */ + if (error) + break; + + cmp_len = *(size_t *)page; + if (unlikely(!cmp_len || + cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { + printk(KERN_ERR "PM: Invalid LZO compressed length\n"); + error = -1; + break; + } + + memcpy(cmp, page, PAGE_SIZE); + for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { + error = swap_read_page(handle, page, NULL); /* sync */ + if (error) + goto out_finish; + + memcpy(cmp + off, page, PAGE_SIZE); + } + + unc_len = LZO_UNC_SIZE; + error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, + unc, &unc_len); + if (error < 0) { + printk(KERN_ERR "PM: LZO decompression failed\n"); + break; + } + + if (unlikely(!unc_len || + unc_len > LZO_UNC_SIZE || + unc_len & (PAGE_SIZE - 1))) { + printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); + error = -1; + break; + } + + for (off = 0; off < unc_len; off += PAGE_SIZE) { + memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); + + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + + error = snapshot_write_next(snapshot); + if (error <= 0) + goto out_finish; + } + } + +out_finish: + do_gettimeofday(&stop); + if (!error) { + printk("\b\b\b\bdone\n"); + snapshot_write_finalize(snapshot); + if (!snapshot_image_loaded(snapshot)) + error = -ENODATA; + } else + printk("\n"); + swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + + vfree(cmp); + vfree(unc); + free_page((unsigned long)page); + + return error; +} + +/** * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should * be written into this memeory location @@ -612,8 +885,11 @@ int swsusp_read(unsigned int *flags_p) goto end; if (!error) error = swap_read_page(&handle, header, NULL); - if (!error) - error = load_image(&handle, &snapshot, header->pages - 1); + if (!error) { + error = (*flags_p & SF_NOCOMPRESS_MODE) ? + load_image(&handle, &snapshot, header->pages - 1) : + load_image_lzo(&handle, &snapshot, header->pages - 1); + } swap_reader_finish(&handle); end: if (!error) @@ -640,7 +916,7 @@ int swsusp_check(void) if (error) goto put; - if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { + if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ error = hib_bio_write_page(swsusp_resume_block, @@ -653,13 +929,13 @@ put: if (error) blkdev_put(hib_resume_bdev, FMODE_READ); else - pr_debug("PM: Signature found, resuming\n"); + pr_debug("PM: Image signature found, resuming\n"); } else { error = PTR_ERR(hib_resume_bdev); } if (error) - pr_debug("PM: Error %d checking image file\n", error); + pr_debug("PM: Image not found (code %d)\n", error); return error; } diff --git a/kernel/printk.c b/kernel/printk.c index 8fe465ac008a..2531017795f6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress); * provides serialisation for access to the entire console * driver system. */ -static DECLARE_MUTEX(console_sem); +static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); @@ -556,7 +556,7 @@ static void zap_locks(void) /* If a crash is occurring, make sure we can't deadlock */ spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ - init_MUTEX(&console_sem); + sema_init(&console_sem, 1); } #if defined(CONFIG_PRINTK_TIME) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4d169835fb36..a23a57a976d1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void) EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); /** - * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * * Check for bottom half being disabled, which covers both the * CONFIG_PROVE_RCU and not cases. Note that if someone uses * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) - * will show the situation. + * will show the situation. This is useful for debug checks in functions + * that require that they be called within an RCU read-side critical + * section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. */ @@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - return in_softirq(); + return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 196ec02f8be0..d806735342ac 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Forward declarations for rcutiny_plugin.h. */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp); + +#include "rcutiny_plugin.h" + #ifdef CONFIG_NO_HZ static long rcu_dynticks_nesting = 1; @@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); + rcu_preempt_check_callbacks(); } /* @@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) *rcp->donetail = NULL; if (rcp->curtail == rcp->donetail) rcp->curtail = &rcp->rcucblist; + rcu_preempt_remove_callbacks(rcp); rcp->donetail = &rcp->rcucblist; local_irq_restore(flags); @@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); } /* @@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head, } /* - * Post an RCU callback to be invoked after the end of an RCU grace + * Post an RCU callback to be invoked after the end of an RCU-sched grace * period. But since we have but one CPU, that would be after any * quiescent state. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { __call_rcu(head, func, &rcu_sched_ctrlblk); } -EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Post an RCU bottom-half callback to be invoked after any subsequent @@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); -void rcu_barrier(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - void rcu_barrier_bh(void) { struct rcu_synchronize rcu; @@ -289,5 +286,3 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } - -#include "rcutiny_plugin.h" diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index d223a92bc742..6ceca4f745ff 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -1,7 +1,7 @@ /* - * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition * Internal non-public definitions that provide either classic - * or preemptable semantics. + * or preemptible semantics. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,11 +17,587 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright IBM Corporation, 2009 + * Copyright (c) 2010 Linaro * * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#ifdef CONFIG_TINY_PREEMPT_RCU + +#include <linux/delay.h> + +/* Global control variables for preemptible RCU. */ +struct rcu_preempt_ctrlblk { + struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ + struct rcu_head **nexttail; + /* Tasks blocked in a preemptible RCU */ + /* read-side critical section while an */ + /* preemptible-RCU grace period is in */ + /* progress must wait for a later grace */ + /* period. This pointer points to the */ + /* ->next pointer of the last task that */ + /* must wait for a later grace period, or */ + /* to &->rcb.rcucblist if there is no */ + /* such task. */ + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. */ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is not such task. */ + struct list_head *exp_tasks; + /* Pointer to first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there cannot be any such task. */ + u8 gpnum; /* Current grace period. */ + u8 gpcpu; /* Last grace period blocked by the CPU. */ + u8 completed; /* Last grace period completed. */ + /* If all three are equal, RCU is idle. */ +}; + +static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { + .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), +}; + +static int rcu_preempted_readers_exp(void); +static void rcu_report_exp_done(void); + +/* + * Return true if the CPU has not yet responded to the current grace period. + */ +static int rcu_cpu_blocking_cur_gp(void) +{ + return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Check for a running RCU reader. Because there is only one CPU, + * there can be but one running RCU reader at a time. ;-) + */ +static int rcu_preempt_running_reader(void) +{ + return current->rcu_read_lock_nesting; +} + +/* + * Check for preempted RCU readers blocking any grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_any(void) +{ + return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); +} + +/* + * Check for preempted RCU readers blocking the current grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_cgp(void) +{ + return rcu_preempt_ctrlblk.gp_tasks != NULL; +} + +/* + * Return true if another preemptible-RCU grace period is needed. + */ +static int rcu_preempt_needs_another_gp(void) +{ + return *rcu_preempt_ctrlblk.rcb.curtail != NULL; +} + +/* + * Return true if a preemptible-RCU grace period is in progress. + * The caller must disable hardirqs. + */ +static int rcu_preempt_gp_in_progress(void) +{ + return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Record a preemptible-RCU quiescent state for the specified CPU. Note + * that this just means that the task currently running on the CPU is + * in a quiescent state. There might be any number of tasks blocked + * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. + * + * Because this is a single-CPU implementation, the only way a grace + * period can end is if the CPU is in a quiescent state. The reason is + * that a blocked preemptible-RCU reader can exit its critical section + * only if the CPU is running it at the time. Therefore, when the + * last task blocking the current grace period exits its RCU read-side + * critical section, neither the CPU nor blocked tasks will be stopping + * the current grace period. (In contrast, SMP implementations + * might have CPUs running in RCU read-side critical sections that + * block later grace periods -- but this is not possible given only + * one CPU.) + */ +static void rcu_preempt_cpu_qs(void) +{ + /* Record both CPU and task as having responded to current GP. */ + rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + + /* + * If there is no GP, or if blocked readers are still blocking GP, + * then there is nothing more to do. + */ + if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) + return; + + /* Advance callbacks. */ + rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; + rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; + + /* If there are no blocked readers, next GP is done instantly. */ + if (!rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; + + /* If there are done callbacks, make RCU_SOFTIRQ process them. */ + if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Start a new RCU grace period if warranted. Hard irqs must be disabled. + */ +static void rcu_preempt_start_gp(void) +{ + if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { + + /* Official start of GP. */ + rcu_preempt_ctrlblk.gpnum++; + + /* Any blocked RCU readers block new GP. */ + if (rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.gp_tasks = + rcu_preempt_ctrlblk.blkd_tasks.next; + + /* If there is no running reader, CPU is done with GP. */ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + } +} + +/* + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the blkd_tasks list. + * If the task started after the current grace period began, as recorded + * by ->gpcpu, we enqueue at the beginning of the list. Otherwise + * before the element referenced by ->gp_tasks (or at the tail if + * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the ->gp_tasks pointer becomes + * NULL. + * + * Caller must disable preemption. + */ +void rcu_preempt_note_context_switch(void) +{ + struct task_struct *t = current; + unsigned long flags; + + local_irq_save(flags); /* must exclude scheduler_tick(). */ + if (rcu_preempt_running_reader() && + (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + + /* Possibly blocking in an RCU read-side critical section. */ + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + + /* + * If this CPU has already checked in, then this task + * will hold up the next grace period rather than the + * current grace period. Queue the task accordingly. + * If the task is queued for the current grace period + * (i.e., this CPU has not yet passed through a quiescent + * state for the current grace period), then as long + * as that task remains queued, the current grace period + * cannot end. + */ + list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); + if (rcu_cpu_blocking_cur_gp()) + rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; + } + + /* + * Either we were not in an RCU read-side critical section to + * begin with, or we have now recorded that critical section + * globally. Either way, we can now note a quiescent state + * for this CPU. Again, if we were in an RCU read-side critical + * section, and if that critical section was blocking the current + * grace period, then the fact that the task has been enqueued + * means that current grace period continues to be blocked. + */ + rcu_preempt_cpu_qs(); + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ +static void rcu_read_unlock_special(struct task_struct *t) +{ + int empty; + int empty_exp; + unsigned long flags; + struct list_head *np; + int special; + + /* + * NMI handlers cannot block and cannot safely manipulate state. + * They therefore cannot possibly be special, so just leave. + */ + if (in_nmi()) + return; + + local_irq_save(flags); + + /* + * If RCU core is waiting for this CPU to exit critical section, + * let it know that we have done so. + */ + special = t->rcu_read_unlock_special; + if (special & RCU_READ_UNLOCK_NEED_QS) + rcu_preempt_cpu_qs(); + + /* Hardware IRQ handlers cannot block. */ + if (in_irq()) { + local_irq_restore(flags); + return; + } + + /* Clean up if blocked during RCU read-side critical section. */ + if (special & RCU_READ_UNLOCK_BLOCKED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + + /* + * Remove this task from the ->blkd_tasks list and adjust + * any pointers that might have been referencing it. + */ + empty = !rcu_preempt_blocked_readers_cgp(); + empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; + np = t->rcu_node_entry.next; + if (np == &rcu_preempt_ctrlblk.blkd_tasks) + np = NULL; + list_del(&t->rcu_node_entry); + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) + rcu_preempt_ctrlblk.gp_tasks = np; + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) + rcu_preempt_ctrlblk.exp_tasks = np; + INIT_LIST_HEAD(&t->rcu_node_entry); + + /* + * If this was the last task on the current list, and if + * we aren't waiting on the CPU, report the quiescent state + * and start a new grace period if needed. + */ + if (!empty && !rcu_preempt_blocked_readers_cgp()) { + rcu_preempt_cpu_qs(); + rcu_preempt_start_gp(); + } + + /* + * If this was the last task on the expedited lists, + * then we need wake up the waiting task. + */ + if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_report_exp_done(); + } + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && + unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* + * Check for a quiescent state from the current CPU. When a task blocks, + * the task is recorded in the rcu_preempt_ctrlblk structure, which is + * checked elsewhere. This is called from the scheduling-clock interrupt. + * + * Caller must disable hard irqs. + */ +static void rcu_preempt_check_callbacks(void) +{ + struct task_struct *t = current; + + if (rcu_preempt_gp_in_progress() && + (!rcu_preempt_running_reader() || + !rcu_cpu_blocking_cur_gp())) + rcu_preempt_cpu_qs(); + if (&rcu_preempt_ctrlblk.rcb.rcucblist != + rcu_preempt_ctrlblk.rcb.donetail) + raise_softirq(RCU_SOFTIRQ); + if (rcu_preempt_gp_in_progress() && + rcu_cpu_blocking_cur_gp() && + rcu_preempt_running_reader()) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; +} + +/* + * TINY_PREEMPT_RCU has an extra callback-list tail pointer to + * update, so this is invoked from __rcu_process_callbacks() to + * handle that case. Of course, it is invoked for all flavors of + * RCU, but RCU callbacks can appear only on one of the lists, and + * neither ->nexttail nor ->donetail can possibly be NULL, so there + * is no need for an explicit check. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ + if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) + rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; +} + +/* + * Process callbacks for preemptible RCU. + */ +static void rcu_preempt_process_callbacks(void) +{ + __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); +} + +/* + * Queue a preemptible -RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcu_preempt_ctrlblk.nexttail = head; + rcu_preempt_ctrlblk.nexttail = &head->next; + rcu_preempt_start_gp(); /* checks to see if GP needed. */ + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu); + +void rcu_barrier(void) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void synchronize_rcu(void) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!rcu_scheduler_active) + return; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + if (!rcu_preempt_blocked_readers_any()) + return; + + /* Once we get past the fastpath checks, same code as rcu_barrier(). */ + rcu_barrier(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static unsigned long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(void) +{ + return rcu_preempt_ctrlblk.exp_tasks != NULL; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. + */ +static void rcu_report_exp_done(void) +{ + wake_up(&sync_rcu_preempt_exp_wq); +} + +/* + * Wait for an rcu-preempt grace period, but expedite it. The basic idea + * is to rely in the fact that there is but one CPU, and that it is + * illegal for a task to invoke synchronize_rcu_expedited() while in a + * preemptible-RCU read-side critical section. Therefore, any such + * critical sections must correspond to blocked tasks, which must therefore + * be on the ->blkd_tasks list. So just record the current head of the + * list in the ->exp_tasks pointer, and wait for all tasks including and + * after the task pointed to by ->exp_tasks to drain. + */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; + unsigned long snap; + + barrier(); /* ensure prior action seen before grace period. */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + + /* + * Acquire lock so that there is only one preemptible RCU grace + * period in flight. Of course, if someone does the expedited + * grace period for us while we are acquiring the lock, just leave. + */ + snap = sync_rcu_preempt_exp_count + 1; + mutex_lock(&sync_rcu_preempt_exp_mutex); + if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) + goto unlock_mb_ret; /* Others did our work for us. */ + + local_irq_save(flags); + + /* + * All RCU readers have to already be on blkd_tasks because + * we cannot legally be executing in an RCU read-side critical + * section. + */ + + /* Snapshot current head of ->blkd_tasks list. */ + rpcp->exp_tasks = rpcp->blkd_tasks.next; + if (rpcp->exp_tasks == &rpcp->blkd_tasks) + rpcp->exp_tasks = NULL; + local_irq_restore(flags); + + /* Wait for tail of ->blkd_tasks list to drain. */ + if (rcu_preempted_readers_exp()) + wait_event(sync_rcu_preempt_exp_wq, + !rcu_preempted_readers_exp()); + + /* Clean up and exit. */ + barrier(); /* ensure expedited GP seen before counter increment. */ + sync_rcu_preempt_exp_count++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); + barrier(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Does preemptible RCU need the CPU to stay out of dynticks mode? + */ +int rcu_preempt_needs_cpu(void) +{ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; +} + +/* + * Check for a task exiting while in a preemptible -RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) + return; + t->rcu_read_lock_nesting = 1; + rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to check. + */ +static void rcu_preempt_check_callbacks(void) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to remove. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to process. + */ +static void rcu_preempt_process_callbacks(void) +{ +} + +#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC #include <linux/kernel_stat.h> diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2e2726d790b9..9d8e8fb2515f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -120,7 +120,7 @@ struct rcu_torture { }; static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current; +static struct rcu_torture __rcu *rcu_torture_current; static long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); @@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ static int fullstop = FULLSTOP_RMMOD; -DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ - /* of kthreads. */ +/* + * Protect fullstop transitions and spawning of kthreads. + */ +static DEFINE_MUTEX(fullstop_mutex); /* * Detect and respond to a system shutdown. @@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) mdelay(longdelay_ms); if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) + preempt_schedule(); /* No QS if preempt_disable() in effect */ +#endif } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); if (!delay) schedule_timeout_interruptible(longdelay); + else + rcu_read_delay(rrsp); } static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) @@ -731,7 +739,8 @@ rcu_torture_writer(void *arg) continue; rp->rtort_pipe_count = 0; udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_torture_current; + old_rp = rcu_dereference_check(rcu_torture_current, + current == writer_task); rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d5bc43976c5a..ccdc04c47981 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -143,6 +143,11 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; +module_param(rcu_cpu_stall_suppress, int, 0644); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -int rcu_cpu_stall_panicking __read_mostly; +int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) { @@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); - /* OK, time to rat on our buddy... */ - + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", rsp->name); rcu_for_each_leaf_node(rsp, rnp) { @@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", rsp->name, smp_processor_id(), jiffies - rsp->gp_start); trigger_all_cpu_backtrace(); @@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) long delta; struct rcu_node *rnp; - if (rcu_cpu_stall_panicking) + if (rcu_cpu_stall_suppress) return; - delta = jiffies - rsp->jiffies_stall; + delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { + if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) { - rcu_cpu_stall_panicking = 1; + rcu_cpu_stall_suppress = 1; return NOTIFY_DONE; } +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_preempt_stall_reset(); +} + static struct notifier_block rcu_panic_block = { .notifier_call = rcu_panic, }; @@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { } +void rcu_cpu_stall_reset(void) +{ +} + static void __init check_cpu_stall_init(void) { } @@ -712,7 +745,7 @@ static void rcu_start_gp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { @@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) { int i; - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ @@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; rsp->orphan_qlen += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen = 0; raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ } @@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) struct rcu_data *rdp; raw_spin_lock_irqsave(&rsp->onofflock, flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); if (rsp->orphan_cbs_list == NULL) { raw_spin_unlock_irqrestore(&rsp->onofflock, flags); return; @@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; rdp->qlen += rsp->orphan_qlen; + rdp->n_cbs_adopted += rsp->orphan_qlen; rsp->orphan_cbs_list = NULL; rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; rsp->orphan_qlen = 0; @@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) unsigned long flags; unsigned long mask; int need_report = 0; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ @@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Update count, and requeue any remaining callbacks. */ rdp->qlen -= count; + rdp->n_cbs_invoked += count; if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; @@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) + if ((rnp->qsmask & bit) != 0 && + f(per_cpu_ptr(rsp->rda, cpu))) mask |= bit; } if (mask != 0) { @@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); rcu_process_gp_end(rsp, rdp); check_for_new_grace_period(rsp, rdp); @@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; int i; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) { unsigned long flags; unsigned long mask; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp) +static void __init rcu_init_one(struct rcu_state *rsp, + struct rcu_data __percpu *rda) { static char *buf[] = { "rcu_node_level_0", "rcu_node_level_1", @@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp) } } + rsp->rda = rda; rnp = rsp->level[NUM_RCU_LVLS - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; - rsp->rda[i]->mynode = rnp; + per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } } -/* - * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used - * nowhere else! Assigns leaf node pointers into each CPU's rcu_data - * structure. - */ -#define RCU_INIT_FLAVOR(rsp, rcu_data) \ -do { \ - int i; \ - \ - for_each_possible_cpu(i) { \ - (rsp)->rda[i] = &per_cpu(rcu_data, i); \ - } \ - rcu_init_one(rsp); \ -} while (0) - void __init rcu_init(void) { int cpu; rcu_bootup_announce(); - RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); - RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); + rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14c040b18ed0..91d4170c5c13 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -202,6 +202,9 @@ struct rcu_data { long qlen; /* # of queued callbacks */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ + unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ + unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -254,19 +257,23 @@ struct rcu_data { #define RCU_STALL_DELAY_DELTA 0 #endif -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ + RCU_STALL_DELAY_DELTA) /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) /* for rsp->jiffies_stall */ #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ /* scheduling clock irq */ /* before ratting on them. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE +#define RCU_CPU_STALL_SUPPRESS_INIT 0 +#else +#define RCU_CPU_STALL_SUPPRESS_INIT 1 +#endif -#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) -#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* * RCU global state, including node hierarchy. This hierarchy is @@ -283,7 +290,7 @@ struct rcu_state { struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ - struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ + struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, #ifdef CONFIG_RCU_CPU_STALL_DETECTOR static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); +static void rcu_preempt_stall_reset(void); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e4f420245d9..71a4147473f9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tRCU-based detection of stalled CPUs is disabled.\n"); #endif -#ifndef CONFIG_RCU_CPU_STALL_VERBOSE +#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); #endif #if NUM_RCU_LVL_4 != 0 @@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = rcu_preempt_state.rda[cpu]; + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu) */ void __rcu_read_lock(void) { - ACCESS_ONCE(current->rcu_read_lock_nesting)++; + current->rcu_read_lock_nesting++; barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -344,7 +344,9 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ - if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); #ifdef CONFIG_PROVE_LOCKING @@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp) } } +/* + * Suppress preemptible RCU's CPU stall warnings by pushing the + * time of the next stall-warning message comfortably far into the + * future. + */ +static void rcu_preempt_stall_reset(void) +{ + rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu); * * Control will return to the caller some time after a full grace * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. + * read-side critical sections have completed. Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. */ void synchronize_rcu(void) { @@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void) */ static void __init __rcu_init_preempt(void) { - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); + rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } /* @@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { } +/* + * Because preemptible RCU does not exist, there is no need to suppress + * its CPU stall warnings. + */ +static void rcu_preempt_stall_reset(void) +{ +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void) } /* - * In classic RCU, call_rcu() is just call_rcu_sched(). - */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - call_rcu_sched(head, func); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptable RCU does not exist, map to rcu-sched. */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 36c95b45738e..d15430b9d122 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); + seq_printf(m, " ci=%lu co=%lu ca=%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } #define PRINT_RCU_DATA(name, func, m) \ @@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); + seq_printf(m, ",%lu,%lu,%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } static int show_rcudata_csv(struct seq_file *m, void *unused) @@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ - seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); + seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU seq_puts(m, "\"rcu_preempt:\"\n"); PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); @@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) struct rcu_data *rdp; for_each_possible_cpu(cpu) { - rdp = rsp->rda[cpu]; + rdp = per_cpu_ptr(rsp->rda, cpu); if (rdp->beenonline) print_one_rcu_pending(m, rdp); } diff --git a/kernel/sched.c b/kernel/sched.c index dc85ceb90832..d42992bccdfa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -426,9 +426,7 @@ struct root_domain { */ cpumask_var_t rto_mask; atomic_t rto_count; -#ifdef CONFIG_SMP struct cpupri cpupri; -#endif }; /* @@ -437,7 +435,7 @@ struct root_domain { */ static struct root_domain def_root_domain; -#endif +#endif /* CONFIG_SMP */ /* * This is the main, per-CPU runqueue data structure. @@ -488,11 +486,12 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr, *idle; + struct task_struct *curr, *idle, *stop; unsigned long next_balance; struct mm_struct *prev_mm; u64 clock; + u64 clock_task; atomic_t nr_iowait; @@ -520,6 +519,10 @@ struct rq { u64 avg_idle; #endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif + /* calc_load related fields */ unsigned long calc_load_update; long calc_load_active; @@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p) #endif /* CONFIG_CGROUP_SCHED */ +static u64 irq_time_cpu(int cpu); +static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); + inline void update_rq_clock(struct rq *rq) { - if (!rq->skip_clock_update) - rq->clock = sched_clock_cpu(cpu_of(rq)); + if (!rq->skip_clock_update) { + int cpu = cpu_of(rq); + u64 irq_time; + + rq->clock = sched_clock_cpu(cpu); + irq_time = irq_time_cpu(cpu); + if (rq->clock - irq_time > rq->clock_task) + rq->clock_task = rq->clock - irq_time; + + sched_irq_time_avg_update(rq, irq_time); + } } /* @@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int i; @@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); if (strncmp(buf, "NO_", 3) == 0) { neg = 1; @@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } for (i = 0; sched_feat_names[i]; i++) { - int len = strlen(sched_feat_names[i]); - - if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { if (neg) sysctl_sched_features &= ~(1UL << i); else @@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) static const struct sched_class rt_sched_class; -#define sched_class_highest (&rt_sched_class) +#define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { - if (task_has_rt_policy(p)) { - p->se.load.weight = 0; - p->se.load.inv_weight = WMULT_CONST; - return; - } - /* * SCHED_IDLE tasks get minimal weight: */ @@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dec_nr_running(rq); } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value (or semi updated value on 32 bit) with a side effect of + * accounting a slice of irq time to wrong task when irq is in progress + * while we read rq->clock. That is a worthy compromise in place of having + * locks on each irq in account_system_time. + */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +static u64 irq_time_cpu(int cpu) +{ + if (!sched_clock_irqtime) + return 0; + + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} + +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + int cpu; + u64 now, delta; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + now = sched_clock_cpu(cpu); + delta = now - per_cpu(irq_start_time, cpu); + per_cpu(irq_start_time, cpu) = now; + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + per_cpu(cpu_hardirq_time, cpu) += delta; + else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + per_cpu(cpu_softirq_time, cpu) += delta; + + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) +{ + if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { + u64 delta_irq = curr_irq_time - rq->prev_irq_time; + rq->prev_irq_time = curr_irq_time; + sched_rt_avg_update(rq, delta_irq); + } +} + +#else + +static u64 irq_time_cpu(int cpu) +{ + return 0; +} + +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } + +#endif + #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "sched_stoptask.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + /* * __normal_prio - return the priority that is based on the static prio */ @@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) if (p->sched_class != &fair_sched_class) return 0; + if (unlikely(p->policy == SCHED_IDLE)) + return 0; + /* * Buddy candidates are cache hot: */ @@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (likely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (likely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; rq->prev_mm = oldmm; } @@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) if (task_current(rq, p)) { update_rq_clock(rq); - ns = rq->clock - p->se.exec_start; + ns = rq->clock_task - p->se.exec_start; if ((s64)ns < 0) ns = 0; } @@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (in_serving_softirq()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); else cpustat->system = cputime64_add(cpustat->system, tmp); @@ -3584,7 +3714,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); - perf_event_task_tick(curr); + perf_event_task_tick(); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq) return p; } - class = sched_class_highest; - for ( ; ; ) { + for_each_class(class) { p = class->pick_next_task(rq); if (p) return p; - /* - * Will never be NULL as the idle class always - * returns a non-NULL p: - */ - class = class->next; } + + BUG(); /* the idle class will always have a runnable task */ } /* @@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = task_rq_lock(p, &flags); + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->se.on_rq; @@ -4645,7 +4772,7 @@ recheck: } if (user) { - retval = security_task_setscheduler(p, policy, param); + retval = security_task_setscheduler(p); if (retval) return retval; } @@ -4661,6 +4788,15 @@ recheck: */ rq = __task_rq_lock(p); + /* + * Changing the policy of the stop threads its a very bad idea + */ + if (p == rq->stop) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EINVAL; + } + #ifdef CONFIG_RT_GROUP_SCHED if (user) { /* @@ -4887,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; - retval = security_task_setscheduler(p, 0, NULL); + retval = security_task_setscheduler(p); if (retval) goto out_unlock; cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); - again: +again: retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { @@ -5337,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + /* + * We're having a chicken and egg problem, even though we are + * holding rq->lock, the cpu isn't yet set to this cpu so the + * lockdep check in task_group() will fail. + * + * Similar case to sched_fork(). / Alternatively we could + * use task_rq_lock() here and obtain the other rq->lock. + * + * Silence PROVE_RCU + */ + rcu_read_lock(); __set_task_cpu(idle, cpu); + rcu_read_unlock(); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) @@ -6514,6 +6662,7 @@ struct s_data { cpumask_var_t nodemask; cpumask_var_t this_sibling_map; cpumask_var_t this_core_map; + cpumask_var_t this_book_map; cpumask_var_t send_covered; cpumask_var_t tmpmask; struct sched_group **sched_group_nodes; @@ -6525,6 +6674,7 @@ enum s_alloc { sa_rootdomain, sa_tmpmask, sa_send_covered, + sa_this_book_map, sa_this_core_map, sa_this_sibling_map, sa_nodemask, @@ -6560,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct static_sched_domain, core_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); -#endif /* CONFIG_SCHED_MC */ -#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int cpu_to_core_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; - +#ifdef CONFIG_SCHED_SMT cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); +#else + group = cpu; +#endif if (sg) *sg = &per_cpu(sched_group_core, group).sg; return group; } -#elif defined(CONFIG_SCHED_MC) +#endif /* CONFIG_SCHED_MC */ + +/* + * book sched-domains: + */ +#ifdef CONFIG_SCHED_BOOK +static DEFINE_PER_CPU(struct static_sched_domain, book_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); + static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) +cpu_to_book_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { + int group = cpu; +#ifdef CONFIG_SCHED_MC + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); + group = cpumask_first(mask); +#endif if (sg) - *sg = &per_cpu(sched_group_core, cpu).sg; - return cpu; + *sg = &per_cpu(sched_group_book, group).sg; + return group; } -#endif +#endif /* CONFIG_SCHED_BOOK */ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); @@ -6594,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; -#ifdef CONFIG_SCHED_MC +#ifdef CONFIG_SCHED_BOOK + cpumask_and(mask, cpu_book_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_MC) cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) @@ -6855,6 +7025,9 @@ SD_INIT_FUNC(CPU) #ifdef CONFIG_SCHED_MC SD_INIT_FUNC(MC) #endif +#ifdef CONFIG_SCHED_BOOK + SD_INIT_FUNC(BOOK) +#endif static int default_relax_domain_level = -1; @@ -6904,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_cpumask_var(d->tmpmask); /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ + case sa_this_book_map: + free_cpumask_var(d->this_book_map); /* fall through */ case sa_this_core_map: free_cpumask_var(d->this_core_map); /* fall through */ case sa_this_sibling_map: @@ -6950,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_nodemask; if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) return sa_this_sibling_map; - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) return sa_this_core_map; + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + return sa_this_book_map; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) return sa_send_covered; d->rd = alloc_rootdomain(); @@ -7009,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_book_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_BOOK + sd = &per_cpu(book_domains, i).sd; + SD_INIT(sd, BOOK); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + static struct sched_domain *__build_mc_sched_domain(struct s_data *d, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *parent, int i) @@ -7066,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, d->send_covered, d->tmpmask); break; #endif +#ifdef CONFIG_SCHED_BOOK + case SD_LV_BOOK: /* set up book groups */ + cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); + if (cpu == cpumask_first(d->this_book_map)) + init_sched_build_groups(d->this_book_map, cpu_map, + &cpu_to_book_group, + d->send_covered, d->tmpmask); + break; +#endif case SD_LV_CPU: /* set up physical groups */ cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); if (!cpumask_empty(d->nodemask)) @@ -7113,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } for_each_cpu(i, cpu_map) { build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); + build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); build_sched_groups(&d, SD_LV_MC, cpu_map, i); } @@ -7149,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map, init_sched_groups_power(i, sd); } #endif +#ifdef CONFIG_SCHED_BOOK + for_each_cpu(i, cpu_map) { + sd = &per_cpu(book_domains, i).sd; + init_sched_groups_power(i, sd); + } +#endif for_each_cpu(i, cpu_map) { sd = &per_cpu(phys_domains, i).sd; @@ -7174,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) sd = &per_cpu(core_domains, i).sd; +#elif defined(CONFIG_SCHED_BOOK) + sd = &per_cpu(book_domains, i).sd; #else sd = &per_cpu(phys_domains, i).sd; #endif @@ -8078,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; - err_free_rq: +err_free_rq: kfree(cfs_rq); - err: +err: return 0; } @@ -8168,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) return 1; - err_free_rq: +err_free_rq: kfree(rt_rq); - err: +err: return 0; } @@ -8528,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg, raw_spin_unlock(&rt_rq->rt_runtime_lock); } raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); - unlock: +unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index db3f674ca49d..933f3d1b62ea 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -25,7 +25,7 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 750000ULL; unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; @@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock; + u64 now = rq_of(cfs_rq)->clock_task; unsigned long delta_exec; if (unlikely(!curr)) @@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ - se->exec_start = rq_of(cfs_rq)->clock; + se->exec_start = rq_of(cfs_rq)->clock_task; } /************************************************** @@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); check_preempt_curr(this_rq, p, 0); + + /* re-arm NEWIDLE balancing when moving tasks */ + src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; + this_rq->idle_stamp = 0; } /* @@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq->clock, sd); + tsk_cache_hot = task_hot(p, rq->clock_task, sd); if (!tsk_cache_hot || sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS @@ -2030,12 +2034,14 @@ struct sd_lb_stats { unsigned long this_load; unsigned long this_load_per_task; unsigned long this_nr_running; + unsigned long this_has_capacity; /* Statistics of the busiest group */ unsigned long max_load; unsigned long busiest_load_per_task; unsigned long busiest_nr_running; unsigned long busiest_group_capacity; + unsigned long busiest_has_capacity; int group_imb; /* Is there imbalance in this sd */ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -2058,6 +2064,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long group_capacity; int group_imb; /* Is there an imbalance in the group ? */ + int group_has_capacity; /* Is there extra capacity in the group? */ }; /** @@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu) u64 total, available; total = sched_avg_period() + (rq->clock - rq->age_stamp); - available = total - rq->rt_avg; + + if (unlikely(total < rq->rt_avg)) { + /* Ensures that power won't end up being negative */ + available = 0; + } else { + available = total - rq->rt_avg; + } if (unlikely((s64)total < SCHED_LOAD_SCALE)) total = SCHED_LOAD_SCALE; @@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { - unsigned long load, max_cpu_load, min_cpu_load; + unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; int i; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long avg_load_per_task = 0; @@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; min_cpu_load = ~0UL; + max_nr_running = 0; for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); @@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, load = target_load(i, load_idx); } else { load = source_load(i, load_idx); - if (load > max_cpu_load) + if (load > max_cpu_load) { max_cpu_load = load; + max_nr_running = rq->nr_running; + } if (min_cpu_load > load) min_cpu_load = load; } @@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = - DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); + + if (sgs->group_capacity > sgs->sum_nr_running) + sgs->group_has_capacity = 1; } /** @@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, /* * In case the child domain prefers tasks go to siblings * first, lower the sg capacity to one so that we'll try - * and move all the excess tasks away. + * and move all the excess tasks away. We lower the capacity + * of a group only if the local group has the capacity to fit + * these excess tasks, i.e. nr_running < group_capacity. The + * extra check prevents the case where you always pull from the + * heaviest group when it is already under-utilized (possible + * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling) + if (prefer_sibling && !local_group && sds->this_has_capacity) sgs.group_capacity = min(sgs.group_capacity, 1UL); if (local_group) { @@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->this = sg; sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; + sds->this_has_capacity = sgs.group_has_capacity; } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; + sds->busiest_has_capacity = sgs.group_has_capacity; sds->group_imb = sgs.group_imb; } @@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, return fix_small_imbalance(sds, this_cpu, imbalance); } + /******* find_busiest_group() helpers end here *********************/ /** @@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * 4) This group is more busy than the avg busieness at this * sched_domain. * 5) The imbalance is within the specified limit. + * + * Note: when doing newidle balance, if the local group has excess + * capacity (i.e. nr_running < group_capacity) and the busiest group + * does not have any capacity, we force a load balance to pull tasks + * to the local group. In this case, we skip past checks 3, 4 and 5. */ if (!(*balance)) goto ret; @@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && + !sds.busiest_has_capacity) + goto force_balance; + if (sds.this_load >= sds.max_load) goto out_balanced; @@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; +force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(&sds, this_cpu, imbalance); return sds.busiest; @@ -3031,7 +3068,14 @@ redo: if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); - sd->nr_balance_failed++; + /* + * Increment the failure counter only on periodic balance. + * We do not want newidle balance, which can be very + * frequent, pollute the failure counter causing + * excessive cache_hot migrations and active balances. + */ + if (idle != CPU_NEWLY_IDLE) + sd->nr_balance_failed++; if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), this_cpu)) { @@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + if (pulled_task) break; - } } raw_spin_lock(&this_rq->lock); @@ -3751,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p) update_rq_clock(rq); - if (unlikely(task_cpu(p) != this_cpu)) + if (unlikely(task_cpu(p) != this_cpu)) { + rcu_read_lock(); __set_task_cpu(p, this_cpu); + rcu_read_unlock(); + } update_curr(cfs_rq); diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 83c66e8ad3ee..185f920ec1a2 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1) * release the lock. Decreases scheduling overhead. */ SCHED_FEAT(OWNER_SPIN, 1) + +/* + * Decrement CPU power based on irq activity + */ +SCHED_FEAT(NONIRQ_POWER, 1) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d10c80ebb67a..bea7d79f7e9c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq) if (!task_has_rt_policy(curr)) return; - delta_exec = rq->clock - curr->se.exec_start; + delta_exec = rq->clock_task - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; @@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock; + curr->se.exec_start = rq->clock_task; cpuacct_charge(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) * runqueue. Otherwise simply start this RT task * on its current runqueue. * - * We want to avoid overloading runqueues. Even if - * the RT task is of higher priority than the current RT task. - * RT tasks behave differently than other tasks. If - * one gets preempted, we try to push it off to another queue. - * So trying to keep a preempting RT task on the same - * cache hot CPU will force the running RT task to - * a cold CPU. So we waste all the cache for the lower - * RT task in hopes of saving some of a RT task - * that is just being woken and probably will have - * cold cache anyway. + * We want to avoid overloading runqueues. If the woken + * task is a higher priority, then it will stay on this CPU + * and the lower prio task should be moved to another CPU. + * Even though this will probably make the lower prio task + * lose its cache, we do not want to bounce a higher task + * around just because it gave up its CPU, perhaps for a + * lock? + * + * For equal prio tasks, we just let the scheduler sort it out. */ if (unlikely(rt_task(rq->curr)) && + (rq->curr->rt.nr_cpus_allowed < 2 || + rq->curr->prio < p->prio) && (p->rt.nr_cpus_allowed > 1)) { int cpu = find_lowest_rq(p); @@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } while (rt_rq); p = rt_task_of(rt_se); - p->se.exec_start = rq->clock; + p->se.exec_start = rq->clock_task; return p; } @@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) for_each_leaf_rt_rq(rt_rq, rq) { array = &rt_rq->active; idx = sched_find_first_bit(array->bitmap); - next_idx: +next_idx: if (idx >= MAX_RT_PRIO) continue; if (next && next->prio < idx) @@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; - retry: +retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); return 0; @@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq) * but possible) */ } - skip: +skip: double_unlock_balance(this_rq, src_rq); } @@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1) + p->rt.nr_cpus_allowed > 1 && + rt_task(rq->curr) && + (rq->curr->rt.nr_cpus_allowed < 2 || + rq->curr->prio < p->prio)) push_rt_tasks(rq); } @@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq) { struct task_struct *p = rq->curr; - p->se.exec_start = rq->clock; + p->se.exec_start = rq->clock_task; /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c new file mode 100644 index 000000000000..45bddc0c1048 --- /dev/null +++ b/kernel/sched_stoptask.c @@ -0,0 +1,108 @@ +/* + * stop-task scheduling class. + * + * The stop task is the highest priority task in the system, it preempts + * everything and will be preempted by nothing. + * + * See kernel/stop_machine.c + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_stop(struct rq *rq, struct task_struct *p, + int sd_flag, int flags) +{ + return task_cpu(p); /* stop tasks as never migrate */ +} +#endif /* CONFIG_SMP */ + +static void +check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +{ + resched_task(rq->curr); /* we preempt everything */ +} + +static struct task_struct *pick_next_task_stop(struct rq *rq) +{ + struct task_struct *stop = rq->stop; + + if (stop && stop->state == TASK_RUNNING) + return stop; + + return NULL; +} + +static void +enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void +dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void yield_task_stop(struct rq *rq) +{ + BUG(); /* the stop task should never yield, its pointless. */ +} + +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +{ +} + +static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_stop(struct rq *rq) +{ +} + +static void switched_to_stop(struct rq *rq, struct task_struct *p, + int running) +{ + BUG(); /* its impossible to change to this class */ +} + +static void prio_changed_stop(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + BUG(); /* how!?, what priority? */ +} + +static unsigned int +get_rr_interval_stop(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +/* + * Simple, special scheduling class for the per-CPU stop tasks: + */ +static const struct sched_class stop_sched_class = { + .next = &rt_sched_class, + + .enqueue_task = enqueue_task_stop, + .dequeue_task = dequeue_task_stop, + .yield_task = yield_task_stop, + + .check_preempt_curr = check_preempt_curr_stop, + + .pick_next_task = pick_next_task_stop, + .put_prev_task = put_prev_task_stop, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_stop, +#endif + + .set_curr_task = set_curr_task_stop, + .task_tick = task_tick_stop, + + .get_rr_interval = get_rr_interval_stop, + + .prio_changed = prio_changed_stop, + .switched_to = switched_to_stop, + + /* no .task_new for stop tasks */ +}; diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73a..e33fd71ed66a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -77,11 +77,21 @@ void wakeup_softirqd(void) } /* + * preempt_count and SOFTIRQ_OFFSET usage: + * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving + * softirq processing. + * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) + * on local_bh_disable or local_bh_enable. + * This lets us distinguish between whether we are currently processing + * softirq and whether we just have bh disabled. + */ + +/* * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS -static void __local_bh_disable(unsigned long ip) +static void __local_bh_disable(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip) * We must manually increment preempt_count here and manually * call the trace_preempt_off later. */ - preempt_count() += SOFTIRQ_OFFSET; + preempt_count() += cnt; /* * Were softirqs turned off above: */ - if (softirq_count() == SOFTIRQ_OFFSET) + if (softirq_count() == cnt) trace_softirqs_off(ip); raw_local_irq_restore(flags); - if (preempt_count() == SOFTIRQ_OFFSET) + if (preempt_count() == cnt) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } #else /* !CONFIG_TRACE_IRQFLAGS */ -static inline void __local_bh_disable(unsigned long ip) +static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) { - add_preempt_count(SOFTIRQ_OFFSET); + add_preempt_count(cnt); barrier(); } #endif /* CONFIG_TRACE_IRQFLAGS */ void local_bh_disable(void) { - __local_bh_disable((unsigned long)__builtin_return_address(0)); + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(local_bh_disable); +static void __local_bh_enable(unsigned int cnt) +{ + WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (softirq_count() == cnt) + trace_softirqs_on((unsigned long)__builtin_return_address(0)); + sub_preempt_count(cnt); +} + /* * Special-case - softirqs can safely be enabled in * cond_resched_softirq(), or by __do_softirq(), @@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable); */ void _local_bh_enable(void) { - WARN_ON_ONCE(in_irq()); - WARN_ON_ONCE(!irqs_disabled()); - - if (softirq_count() == SOFTIRQ_OFFSET) - trace_softirqs_on((unsigned long)__builtin_return_address(0)); - sub_preempt_count(SOFTIRQ_OFFSET); + __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(_local_bh_enable); @@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip) /* * Are softirqs going to be turned on now: */ - if (softirq_count() == SOFTIRQ_OFFSET) + if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) trace_softirqs_on(ip); /* * Keep preemption disabled until we are done with * softirq processing: */ - sub_preempt_count(SOFTIRQ_OFFSET - 1); + sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) do_softirq(); @@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_system_vtime(current); - __local_bh_disable((unsigned long)__builtin_return_address(0)); + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); @@ -212,18 +229,20 @@ restart: do { if (pending & 1) { + unsigned int vec_nr = h - softirq_vec; int prev_count = preempt_count(); - kstat_incr_softirqs_this_cpu(h - softirq_vec); - trace_softirq_entry(h, softirq_vec); + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); h->action(h); - trace_softirq_exit(h, softirq_vec); + trace_softirq_exit(vec_nr); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %td %s %p" + printk(KERN_ERR "huh, entered softirq %u %s %p" "with preempt_count %08x," - " exited with %08x?\n", h - softirq_vec, - softirq_to_name[h - softirq_vec], - h->action, prev_count, preempt_count()); + " exited with %08x?\n", vec_nr, + softirq_to_name[vec_nr], h->action, + prev_count, preempt_count()); preempt_count() = prev_count; } @@ -245,7 +264,7 @@ restart: lockdep_softirq_exit(); account_system_vtime(current); - _local_bh_enable(); + __local_bh_enable(SOFTIRQ_OFFSET); } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -279,10 +298,16 @@ void irq_enter(void) rcu_irq_enter(); if (idle_cpu(cpu) && !in_interrupt()) { - __irq_enter(); + /* + * Prevent raise_softirq from needlessly waking up ksoftirqd + * here, as softirq will be serviced on return from interrupt. + */ + local_bh_disable(); tick_check_idle(cpu); - } else - __irq_enter(); + _local_bh_enable(); + } + + __irq_enter(); } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED @@ -696,6 +721,7 @@ static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); + current->flags |= PF_KSOFTIRQD; while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { @@ -886,17 +912,14 @@ int __init __weak early_irq_init(void) return 0; } +#ifdef CONFIG_GENERIC_HARDIRQS int __init __weak arch_probe_nr_irqs(void) { - return 0; + return NR_IRQS_LEGACY; } int __init __weak arch_early_irq_init(void) { return 0; } - -int __weak arch_init_chip_data(struct irq_desc *desc, int node) -{ - return 0; -} +#endif diff --git a/kernel/srcu.c b/kernel/srcu.c index 2980da3fd509..c71e07500536 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) int __init_srcu_struct(struct srcu_struct *sp, const char *name, struct lock_class_key *key) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ return init_srcu_struct_fields(sp); } EXPORT_SYMBOL_GPL(__init_srcu_struct); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4372ccb25127..090c28812ce1 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -287,11 +287,12 @@ repeat: goto repeat; } +extern void sched_set_stop_task(int cpu, struct task_struct *stop); + /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; unsigned int cpu = (unsigned long)hcpu; struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct task_struct *p; @@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, cpu); if (IS_ERR(p)) return NOTIFY_BAD; - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); get_task_struct(p); + kthread_bind(p, cpu); + sched_set_stop_task(cpu, p); stopper->thread = p; break; case CPU_ONLINE: - kthread_bind(stopper->thread, cpu); /* strictly unnecessary, as first user will wake it */ wake_up_process(stopper->thread); /* mark enabled */ @@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, { struct cpu_stop_work *work; + sched_set_stop_task(cpu, NULL); /* kill the stopper */ kthread_stop(stopper->thread); /* drain remaining works */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bad369ec5403..c782fe9924c7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -50,6 +50,7 @@ cond_syscall(compat_sys_sendmsg); cond_syscall(sys_recvmsg); cond_syscall(sys_recvmmsg); cond_syscall(compat_sys_recvmsg); +cond_syscall(compat_sys_recv); cond_syscall(compat_sys_recvfrom); cond_syscall(compat_sys_recvmmsg); cond_syscall(sys_socketcall); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 4f104515a19b..f8b11a283171 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -115,7 +115,9 @@ static int test_kprobes(void) int ret; struct kprobe *kps[2] = {&kp, &kp2}; - kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + kp.addr = NULL; + kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -210,7 +212,9 @@ static int test_jprobes(void) int ret; struct jprobe *jps[2] = {&jp, &jp2}; - jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + jp.kp.addr = NULL; + jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -323,7 +327,9 @@ static int test_kretprobes(void) int ret; struct kretprobe *rps[2] = {&rp, &rp2}; - rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + rp.kp.addr = NULL; + rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c63116863a80..d2321891538f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -149,10 +149,18 @@ static void ntp_update_offset(long offset) time_reftime = get_seconds(); offset64 = offset; - freq_adj = (offset64 * secs) << - (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + freq_adj = ntp_update_offset_fll(offset64, secs); - freq_adj += ntp_update_offset_fll(offset64, secs); + /* + * Clamp update interval to reduce PLL gain with low + * sampling rate (e.g. intermittent network connection) + * to avoid instability. + */ + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) + secs = 1 << (SHIFT_PLL + 1 + time_constant); + + freq_adj += (offset64 * secs) << + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); diff --git a/kernel/timer.c b/kernel/timer.c index 97bf05baade7..68a9ae7679b7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> -#include <linux/perf_event.h> +#include <linux/irq_work.h> #include <linux/sched.h> #include <linux/slab.h> @@ -1279,7 +1279,10 @@ void update_process_times(int user_tick) run_local_timers(); rcu_check_callbacks(cpu, user_tick); printk_tick(); - perf_event_do_pending(); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_run(); +#endif scheduler_tick(); run_posix_cpu_timers(p); } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 538501c6ea50..e04b8bcdef88 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS help See Documentation/trace/ftrace-design.txt +config HAVE_C_RECORDMCOUNT + bool + help + C version of recordmcount available? + config TRACER_MAX_TRACE bool @@ -121,7 +126,7 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER + select FRAME_POINTER if (!ARM_UNWIND) select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa7ece649fe1..ebd80d50c474 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -884,10 +884,8 @@ enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_ENABLE_MCOUNT = (1 << 3), - FTRACE_DISABLE_MCOUNT = (1 << 4), - FTRACE_START_FUNC_RET = (1 << 5), - FTRACE_STOP_FUNC_RET = (1 << 6), + FTRACE_START_FUNC_RET = (1 << 3), + FTRACE_STOP_FUNC_RET = (1 << 4), }; static int ftrace_filtered; @@ -1226,8 +1224,6 @@ static void ftrace_shutdown(int command) static void ftrace_startup_sysctl(void) { - int command = FTRACE_ENABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; @@ -1235,23 +1231,17 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - command |= FTRACE_ENABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_ENABLE_CALLS); } static void ftrace_shutdown_sysctl(void) { - int command = FTRACE_DISABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; /* ftrace_start_up is true if ftrace is running */ if (ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_DISABLE_CALLS); } static cycle_t ftrace_update_time; @@ -1368,24 +1358,29 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - struct ftrace_page *pg; - int hidx; - int idx; - unsigned flags; - struct trace_parser parser; + loff_t pos; + loff_t func_pos; + struct ftrace_page *pg; + struct dyn_ftrace *func; + struct ftrace_func_probe *probe; + struct trace_parser parser; + int hidx; + int idx; + unsigned flags; }; static void * -t_hash_next(struct seq_file *m, void *v, loff_t *pos) +t_hash_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct hlist_node *hnd = v; + struct hlist_node *hnd = NULL; struct hlist_head *hhd; - WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); - (*pos)++; + iter->pos = *pos; + if (iter->probe) + hnd = &iter->probe->node; retry: if (iter->hidx >= FTRACE_FUNC_HASHSIZE) return NULL; @@ -1408,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos) } } - return hnd; + if (WARN_ON_ONCE(!hnd)) + return NULL; + + iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + + return iter; } static void *t_hash_start(struct seq_file *m, loff_t *pos) @@ -1417,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_HASH)) - *pos = 0; - - iter->flags |= FTRACE_ITER_HASH; + if (iter->func_pos > *pos) + return NULL; iter->hidx = 0; - for (l = 0; l <= *pos; ) { - p = t_hash_next(m, p, &l); + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_hash_next(m, &l); if (!p) break; } - return p; + if (!p) + return NULL; + + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_HASH; + + return iter; } -static int t_hash_show(struct seq_file *m, void *v) +static int +t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) { struct ftrace_func_probe *rec; - struct hlist_node *hnd = v; - rec = hlist_entry(hnd, struct ftrace_func_probe, node); + rec = iter->probe; + if (WARN_ON_ONCE(!rec)) + return -EIO; if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); @@ -1457,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) struct dyn_ftrace *rec = NULL; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, v, pos); + return t_hash_next(m, pos); (*pos)++; + iter->pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) - return NULL; + return t_hash_start(m, pos); retry: if (iter->idx >= iter->pg->index) { @@ -1491,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } - return rec; + if (!rec) + return t_hash_start(m, pos); + + iter->func_pos = *pos; + iter->func = rec; + + return iter; +} + +static void reset_iter_read(struct ftrace_iterator *iter) +{ + iter->pos = 0; + iter->func_pos = 0; + iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -1502,6 +1522,12 @@ static void *t_start(struct seq_file *m, loff_t *pos) mutex_lock(&ftrace_lock); /* + * If an lseek was done, then reset and start from beginning. + */ + if (*pos < iter->pos) + reset_iter_read(iter); + + /* * For set_ftrace_filter reading, if we have the filter * off, we can short cut and just print out that all * functions are enabled. @@ -1518,6 +1544,11 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (iter->flags & FTRACE_ITER_HASH) return t_hash_start(m, pos); + /* + * Unfortunately, we need to restart at ftrace_pages_start + * every time we let go of the ftrace_mutex. This is because + * those pointers can change without the lock. + */ iter->pg = ftrace_pages_start; iter->idx = 0; for (l = 0; l <= *pos; ) { @@ -1526,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p && iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); + if (!p) { + if (iter->flags & FTRACE_ITER_FILTER) + return t_hash_start(m, pos); - return p; + return NULL; + } + + return iter; } static void t_stop(struct seq_file *m, void *p) @@ -1540,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; - struct dyn_ftrace *rec = v; + struct dyn_ftrace *rec; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, v); + return t_hash_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { seq_printf(m, "#### all functions enabled ####\n"); return 0; } + rec = iter->func; + if (!rec) return 0; @@ -1601,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file) ret = ftrace_avail_open(inode, file); if (!ret) { - m = (struct seq_file *)file->private_data; - iter = (struct ftrace_iterator *)m->private; + m = file->private_data; + iter = m->private; iter->flags = FTRACE_ITER_FAILURES; } @@ -2418,7 +2455,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = no_llseek, + .llseek = ftrace_regex_lseek, .release = ftrace_filter_release, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ad25490f8b40..ec5c71005c14 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2615,6 +2615,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. + */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) +{ + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} + /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer @@ -2623,16 +2636,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) - - cpu_buffer->read; - return ret; + return rb_num_of_entries(cpu_buffer); } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); @@ -2693,8 +2703,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += (local_read(&cpu_buffer->entries) - - local_read(&cpu_buffer->overrun)) - cpu_buffer->read; + entries += rb_num_of_entries(cpu_buffer); } return entries; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ec59f541156..82d9b8106cd0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; @@ -3996,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); struct dentry *d_cpu; - /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ - char cpu_dir[7]; + char cpu_dir[30]; /* 30 characters should be more than enough */ - if (cpu > 999 || cpu < 0) - return; - - sprintf(cpu_dir, "cpu%ld", cpu); + snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = debugfs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d39b3c5454a5..9021f8c0c0c3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 31cc4cb0dbf2..39c059ca670e 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,7 +9,7 @@ #include <linux/kprobes.h> #include "trace.h" -static char *perf_trace_buf[4]; +static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses @@ -24,7 +24,7 @@ static int total_ref_count; static int perf_trace_event_init(struct ftrace_event_call *tp_event, struct perf_event *p_event) { - struct hlist_head *list; + struct hlist_head __percpu *list; int ret = -ENOMEM; int cpu; @@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, tp_event->perf_events = list; if (!total_ref_count) { - char *buf; + char __percpu *buf; int i; - for (i = 0; i < 4; i++) { - buf = (char *)alloc_percpu(perf_trace_t); + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + buf = (char __percpu *)alloc_percpu(perf_trace_t); if (!buf) goto fail; @@ -65,7 +65,7 @@ fail: if (!total_ref_count) { int i; - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } @@ -101,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event) return ret; } -int perf_trace_enable(struct perf_event *p_event) +int perf_trace_add(struct perf_event *p_event, int flags) { struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head __percpu *pcpu_list; struct hlist_head *list; - list = tp_event->perf_events; - if (WARN_ON_ONCE(!list)) + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) return -EINVAL; - list = this_cpu_ptr(list); + if (!(flags & PERF_EF_START)) + p_event->hw.state = PERF_HES_STOPPED; + + list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); return 0; } -void perf_trace_disable(struct perf_event *p_event) +void perf_trace_del(struct perf_event *p_event, int flags) { hlist_del_rcu(&p_event->hlist_entry); } @@ -142,7 +146,7 @@ void perf_trace_destroy(struct perf_event *p_event) tp_event->perf_events = NULL; if (!--total_ref_count) { - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4c758f146328..398c0e8b332c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -600,21 +600,29 @@ out: enum { FORMAT_HEADER = 1, - FORMAT_PRINTFMT = 2, + FORMAT_FIELD_SEPERATOR = 2, + FORMAT_PRINTFMT = 3, }; static void *f_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_event_call *call = m->private; struct ftrace_event_field *field; - struct list_head *head; + struct list_head *common_head = &ftrace_common_fields; + struct list_head *head = trace_get_fields(call); (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: - head = &ftrace_common_fields; + if (unlikely(list_empty(common_head))) + return NULL; + + field = list_entry(common_head->prev, + struct ftrace_event_field, link); + return field; + case FORMAT_FIELD_SEPERATOR: if (unlikely(list_empty(head))) return NULL; @@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) return NULL; } - head = trace_get_fields(call); - - /* - * To separate common fields from event fields, the - * LSB is set on the first event field. Clear it in case. - */ - v = (void *)((unsigned long)v & ~1L); - field = v; - /* - * If this is a common field, and at the end of the list, then - * continue with main list. - */ - if (field->link.prev == &ftrace_common_fields) { - if (unlikely(list_empty(head))) - return NULL; - field = list_entry(head->prev, struct ftrace_event_field, link); - /* Set the LSB to notify f_show to print an extra newline */ - field = (struct ftrace_event_field *) - ((unsigned long)field | 1); - return field; - } - - /* If we are done tell f_show to print the format */ - if (field->link.prev == head) + if (field->link.prev == common_head) + return (void *)FORMAT_FIELD_SEPERATOR; + else if (field->link.prev == head) return (void *)FORMAT_PRINTFMT; field = list_entry(field->link.prev, struct ftrace_event_field, link); @@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v) seq_printf(m, "format:\n"); return 0; + case FORMAT_FIELD_SEPERATOR: + seq_putc(m, '\n'); + return 0; + case FORMAT_PRINTFMT: seq_printf(m, "\nprint fmt: %s\n", call->print_fmt); return 0; } - /* - * To separate common fields from event fields, the - * LSB is set on the first event field. Clear it and - * print a newline if it is set. - */ - if ((unsigned long)v & 1) { - seq_putc(m, '\n'); - v = (void *)((unsigned long)v & ~1L); - } - field = v; /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6f233698518e..76b05980225c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -15,15 +15,19 @@ #include "trace.h" #include "trace_output.h" +/* When set, irq functions will be ignored */ +static int ftrace_graph_skip_irqs; + struct fgraph_cpu_data { pid_t last_pid; int depth; + int depth_irq; int ignore; unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; struct fgraph_data { - struct fgraph_cpu_data *cpu_data; + struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. */ struct ftrace_graph_ent_entry ent; @@ -41,6 +45,7 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, /* Display absolute time of an entry */ { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, + /* Display interrupts */ + { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns and proc by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | - TRACE_GRAPH_PRINT_DURATION, + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, .opts = trace_opts }; @@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr, return 1; } +static inline int ftrace_graph_ignore_irqs(void) +{ + if (!ftrace_graph_skip_irqs) + return 0; + + return in_irq(); +} + int trace_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = graph_array; @@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* trace it when it is-nested-in or is a function enabled. */ - if (!(trace->depth || ftrace_graph_addr(trace->func))) + if (!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) return 0; local_irq_save(flags); @@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) return trace_graph_entry(trace); } +static void +__trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + __trace_graph_function(tr, ip, flags, pc); +} + void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, @@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { - snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", - nsecs_rem); + size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); + + snprintf(nsecs_str, slen, "%03lu", nsecs_rem); ret = trace_seq_printf(s, ".%s", nsecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return 0; } +/* + * Entry check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just extered irq code + * + * retunns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_entry(struct trace_iterator *iter, u32 flags, + unsigned long addr, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are inside the irq code + */ + if (*depth_irq >= 0) + return 1; + + if ((addr < (unsigned long)__irqentry_text_start) || + (addr >= (unsigned long)__irqentry_text_end)) + return 0; + + /* + * We are entering irq code. + */ + *depth_irq = depth; + return 1; +} + +/* + * Return check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just left irq code + * + * returns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_return(struct trace_iterator *iter, u32 flags, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are not inside the irq code. + */ + if (*depth_irq == -1) + return 0; + + /* + * We are inside the irq code, and this is returning entry. + * Let's not trace it and clear the entry depth, since + * we are out of irq code. + * + * This condition ensures that we 'leave the irq code' once + * we are out of the entry depth. Thus protecting us from + * the RETURN entry loss. + */ + if (*depth_irq >= depth) { + *depth_irq = -1; + return 1; + } + + /* + * We are inside the irq code, and this is not the entry. + */ + return 1; +} + static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter, u32 flags) @@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t ret; int cpu = iter->cpu; + if (check_irq_entry(iter, flags, call->func, call->depth)) + return TRACE_TYPE_HANDLED; + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) return TRACE_TYPE_PARTIAL_LINE; @@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int ret; int i; + if (check_irq_return(iter, flags, trace->depth)) + return TRACE_TYPE_HANDLED; + if (data) { struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; @@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -print_graph_function_flags(struct trace_iterator *iter, u32 flags) +__print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return print_graph_function_flags(iter, tracer_flags.val); + return __print_graph_function_flags(iter, tracer_flags.val); +} + +enum print_line_t print_graph_function_flags(struct trace_iterator *iter, + u32 flags) +{ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + return __print_graph_function_flags(iter, flags); } static enum print_line_t @@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s|||| / \n", size, spaces); } -void print_graph_headers_flags(struct seq_file *s, u32 flags) +static void __print_graph_headers_flags(struct seq_file *s, u32 flags) { int lat = trace_flags & TRACE_ITER_LATENCY_FMT; @@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s) print_graph_headers_flags(s, tracer_flags.val); } +void print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + struct trace_iterator *iter = s->private; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + __print_graph_headers_flags(s, flags); +} + void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ @@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter) pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + *pid = -1; *depth = 0; *ignore = 0; + *depth_irq = -1; } iter->private = data; @@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter) } } +static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_GRAPH_PRINT_IRQS) + ftrace_graph_skip_irqs = !set; + + return 0; +} + static struct trace_event_functions graph_functions = { .trace = print_graph_function_event, }; @@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = { .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, + .set_flag = func_graph_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function_graph, #endif diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 73a6b0601f2e..5cf8c602b880 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; #ifdef CONFIG_FUNCTION_TRACER /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the preempt and irqs off function tracers. + * + * Returns 1 if it is OK to continue, and data->disabled is + * incremented. + * 0 if the trace is to be ignored, and data->disabled + * is kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +static int func_prolog_dec(struct trace_array *tr, + struct trace_array_cpu **data, + unsigned long *flags) { - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; @@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) */ cpu = raw_smp_processor_id(); if (likely(!per_cpu(tracing_cpu, cpu))) - return; + return 0; - local_save_flags(flags); + local_save_flags(*flags); /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; + if (!irqs_disabled_flags(*flags)) + return 0; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, preempt_count()); + return 1; + + atomic_dec(&(*data)->disabled); + + return 0; +} + +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + trace_function(tr, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; int ret; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) + if (!func_prolog_dec(tr, &data, &flags)) return 0; - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return 0; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else - ret = 0; - + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); atomic_dec(&data->disabled); + return ret; } @@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) + if (!func_prolog_dec(tr, &data, &flags)) return; - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); atomic_dec(&data->disabled); } @@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) - flags |= TRACE_GRAPH_PRINT_DURATION; - else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - /* * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ if (is_graph()) - return print_graph_function_flags(iter, flags); + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; } static void irqsoff_print_header(struct seq_file *s) { - if (is_graph()) { - struct trace_iterator *iter = s->private; - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return; - - print_trace_header(s, iter); - flags |= TRACE_GRAPH_PRINT_DURATION; - } else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - - print_graph_headers_flags(s, flags); - } else + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else trace_default_header(s); } static void -trace_graph_function(struct trace_array *tr, - unsigned long ip, unsigned long flags, int pc) -{ - u64 time = trace_clock_local(); - struct ftrace_graph_ent ent = { - .func = ip, - .depth = 0, - }; - struct ftrace_graph_ret ret = { - .func = ip, - .depth = 0, - .calltime = time, - .rettime = time, - }; - - __trace_graph_entry(tr, &ent, flags, pc); - __trace_graph_return(tr, &ret, flags, pc); -} - -static void __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (!is_graph()) + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else trace_function(tr, ip, parent_ip, flags, pc); - else { - trace_graph_function(tr, parent_ip, flags, pc); - trace_graph_function(tr, ip, flags, pc); - } } #else diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 544301d29dee..b8d2852baa4a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -648,7 +648,7 @@ static int register_trace_probe(struct trace_probe *tp) } ret = register_probe_event(tp); if (ret) { - pr_warning("Faild to register probe event(%d)\n", ret); + pr_warning("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4086eae6e81b..7319559ed59f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -31,48 +31,98 @@ static int wakeup_rt; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_lat_flag; +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + #ifdef CONFIG_FUNCTION_TRACER + /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the wakeup function tracers. + * + * Returns 1 if it is OK to continue, and preemption + * is disabled and data->disabled is incremented. + * 0 if the trace is to be ignored, and preemption + * is not disabled and data->disabled is + * kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +static int +func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, + int *pc) { - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; - int pc; if (likely(!wakeup_task)) - return; + return 0; - pc = preempt_count(); + *pc = preempt_count(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); if (cpu != wakeup_current_cpu) goto out_enable; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; - local_irq_save(flags); + return 1; - trace_function(tr, ip, parent_ip, flags, pc); +out: + atomic_dec(&(*data)->disabled); + +out_enable: + preempt_enable_notrace(); + return 0; +} +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); local_irq_restore(flags); - out: atomic_dec(&data->disabled); - out_enable: preempt_enable_notrace(); } @@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ +static int start_func_tracer(int graph) +{ + int ret; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(int graph) +{ + tracer_enabled = 0; + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_func_tracer(!set); + + wakeup_reset(wakeup_trace); + tracing_max_latency = 0; + + return start_func_tracer(set); +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc, ret = 0; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return 0; + + local_save_flags(flags); + ret = __trace_graph_entry(tr, trace, flags, pc); + atomic_dec(&data->disabled); + preempt_enable_notrace(); + + return ret; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_save_flags(flags); + __trace_graph_return(tr, trace, flags, pc); + atomic_dec(&data->disabled); + + preempt_enable_notrace(); + return; +} + +static void wakeup_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); +} + +static void wakeup_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} +#else +#define __trace_function trace_function + +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +static void wakeup_print_header(struct seq_file *s) { } +static void wakeup_trace_open(struct trace_iterator *iter) { } +static void wakeup_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? */ @@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore, /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; - trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); T0 = data->preempt_timestamp; @@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) * is not called by an assembly function (where as schedule is) * it should be safe to use it here. */ - trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: arch_spin_unlock(&wakeup_lock); @@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - register_ftrace_function(&trace_ops); - - if (tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; + if (start_func_tracer(is_graph())) + printk(KERN_ERR "failed to start wakeup tracer\n"); return; fail_deprobe_wake_new: @@ -320,7 +516,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); + stop_func_tracer(is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly = .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; @@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly = .stop = wakeup_tracer_stop, .wait_pipe = poll_wait_pipe, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index a7cc3793baf6..209b379a4721 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void) { int ret, cpu; + for_each_possible_cpu(cpu) { + spin_lock_init(&workqueue_cpu_stat(cpu)->lock); + INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); + } + ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); if (ret) goto out; @@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void) if (ret) goto no_creation; - for_each_possible_cpu(cpu) { - spin_lock_init(&workqueue_cpu_stat(cpu)->lock); - INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); - } - return 0; no_creation: diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index c77f3eceea25..e95ee7f31d43 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,6 +25,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/jump_label.h> extern struct tracepoint __start___tracepoints[]; extern struct tracepoint __stop___tracepoints[]; @@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - elem->state = active; + if (!elem->state && active) { + jump_label_enable(&elem->state); + elem->state = active; + } else if (elem->state && !active) { + jump_label_disable(&elem->state); + elem->state = active; + } } /* @@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem) if (elem->unregfunc && elem->state) elem->unregfunc(); - elem->state = 0; + if (elem->state) { + jump_label_disable(&elem->state); + elem->state = 0; + } rcu_assign_pointer(elem->funcs, NULL); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f9c3c52ecc1..bafba687a6d8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif -static int __read_mostly did_panic; static int __initdata no_watchdog; @@ -187,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -static int -watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = watchdog_panic, -}; - #ifdef CONFIG_HARDLOCKUP_DETECTOR static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, @@ -209,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = { }; /* Callback function for perf event subsystem */ -void watchdog_overflow_callback(struct perf_event *event, int nmi, +static void watchdog_overflow_callback(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { @@ -371,14 +358,14 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(); - event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); if (!IS_ERR(event)) { printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); goto out_save; } printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); - return -1; + return PTR_ERR(event); /* success path */ out_save: @@ -422,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu) static int watchdog_enable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + int err; /* enable the perf event */ - if (watchdog_nmi_enable(cpu) != 0) - return -1; + err = watchdog_nmi_enable(cpu); + if (err) + return err; /* create the watchdog thread */ if (!p) { p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - return -1; + return PTR_ERR(p); } kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; @@ -484,6 +473,9 @@ static void watchdog_disable_all_cpus(void) { int cpu; + if (no_watchdog) + return; + for_each_online_cpu(cpu) watchdog_disable(cpu); @@ -526,17 +518,16 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; + int err = 0; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (watchdog_prepare_cpu(hotcpu)) - return NOTIFY_BAD; + err = watchdog_prepare_cpu(hotcpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (watchdog_enable(hotcpu)) - return NOTIFY_BAD; + err = watchdog_enable(hotcpu); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: @@ -549,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif /* CONFIG_HOTPLUG_CPU */ } - return NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __cpuinitdata cpu_nfb = { @@ -565,13 +556,11 @@ static int __init spawn_watchdog_task(void) return 0; err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - WARN_ON(err == NOTIFY_BAD); + WARN_ON(notifier_to_errno(err)); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); - return 0; } early_initcall(spawn_watchdog_task); |