From 23fb064bb96f001ecb8682129f7ee1bc1ca691bc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Jul 2009 21:18:35 +0900 Subject: percpu: kill legacy percpu allocator With ia64 converted, there's no arch left which still uses legacy percpu allocator. Kill it. Signed-off-by: Tejun Heo Delightedly-acked-by: Rusty Russell Cc: Ingo Molnar Cc: Christoph Lameter --- kernel/module.c | 150 -------------------------------------------------------- 1 file changed, 150 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 8b7d8805819d..64787cddeb5e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -370,8 +370,6 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA - static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) { @@ -395,154 +393,6 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ - -/* Number of blocks used and allocated. */ -static unsigned int pcpu_num_used, pcpu_num_allocated; -/* Size of each block. -ve means used. */ -static int *pcpu_size; - -static int split_block(unsigned int i, unsigned short size) -{ - /* Reallocation required? 
*/ - if (pcpu_num_used + 1 > pcpu_num_allocated) { - int *new; - - new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2, - GFP_KERNEL); - if (!new) - return 0; - - pcpu_num_allocated *= 2; - pcpu_size = new; - } - - /* Insert a new subblock */ - memmove(&pcpu_size[i+1], &pcpu_size[i], - sizeof(pcpu_size[0]) * (pcpu_num_used - i)); - pcpu_num_used++; - - pcpu_size[i+1] -= size; - pcpu_size[i] = size; - return 1; -} - -static inline unsigned int block_size(int val) -{ - if (val < 0) - return -val; - return val; -} - -static void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) -{ - unsigned long extra; - unsigned int i; - void *ptr; - int cpu; - - if (align > PAGE_SIZE) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - name, align, PAGE_SIZE); - align = PAGE_SIZE; - } - - ptr = __per_cpu_start; - for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { - /* Extra for alignment requirement. */ - extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr; - BUG_ON(i == 0 && extra != 0); - - if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size) - continue; - - /* Transfer extra to previous block. */ - if (pcpu_size[i-1] < 0) - pcpu_size[i-1] -= extra; - else - pcpu_size[i-1] += extra; - pcpu_size[i] -= extra; - ptr += extra; - - /* Split block if warranted */ - if (pcpu_size[i] - size > sizeof(unsigned long)) - if (!split_block(i, size)) - return NULL; - - /* add the per-cpu scanning areas */ - for_each_possible_cpu(cpu) - kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0, - GFP_KERNEL); - - /* Mark allocated */ - pcpu_size[i] = -pcpu_size[i]; - return ptr; - } - - printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", - size); - return NULL; -} - -static void percpu_modfree(void *freeme) -{ - unsigned int i; - void *ptr = __per_cpu_start + block_size(pcpu_size[0]); - int cpu; - - /* First entry is core kernel percpu data. 
*/ - for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { - if (ptr == freeme) { - pcpu_size[i] = -pcpu_size[i]; - goto free; - } - } - BUG(); - - free: - /* remove the per-cpu scanning areas */ - for_each_possible_cpu(cpu) - kmemleak_free(freeme + per_cpu_offset(cpu)); - - /* Merge with previous? */ - if (pcpu_size[i-1] >= 0) { - pcpu_size[i-1] += pcpu_size[i]; - pcpu_num_used--; - memmove(&pcpu_size[i], &pcpu_size[i+1], - (pcpu_num_used - i) * sizeof(pcpu_size[0])); - i--; - } - /* Merge with next? */ - if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) { - pcpu_size[i] += pcpu_size[i+1]; - pcpu_num_used--; - memmove(&pcpu_size[i+1], &pcpu_size[i+2], - (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0])); - } -} - -static int percpu_modinit(void) -{ - pcpu_num_used = 2; - pcpu_num_allocated = 2; - pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, - GFP_KERNEL); - /* Static in-kernel percpu data (used). */ - pcpu_size[0] = -(__per_cpu_end-__per_cpu_start); - /* Free room. */ - pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; - if (pcpu_size[1] < 0) { - printk(KERN_ERR "No per-cpu room for modules.\n"); - pcpu_num_used = 1; - } - - return 0; -} -__initcall(percpu_modinit); - -#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ - static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) -- cgit v1.2.1 From e800879d50c5a528d40191528557b1bdfbccbd42 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 3 Oct 2009 19:48:23 +0900 Subject: this_cpu: Use this_cpu operations in RCU RCU does not do dynamic allocations but it increments per cpu variables a lot. These instructions result in a move to a register and then back to memory. This patch will make it use the inc/dec instructions on x86 that do not need a register. Acked-by: Tejun Heo Acked-by: Paul E.
McKenney Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- kernel/rcutorture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 233768f21f97..178967b6434e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -731,13 +731,13 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_count)[pipe_count]; + __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_batch)[completed]; + __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); preempt_enable(); cur_ops->readunlock(idx); } @@ -786,13 +786,13 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_count)[pipe_count]; + __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_batch)[completed]; + __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); preempt_enable(); cur_ops->readunlock(idx); schedule(); -- cgit v1.2.1 From 9288f99aa52d90a5b82573c4b769c97c55af2f56 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 7 Oct 2009 19:17:45 -0400 Subject: this_cpu: Use this_cpu_xx for ftrace this_cpu_xx can reduce the instruction count here and also avoid address arithmetic. 
Signed-off-by: Christoph Lameter Acked-by: Ingo Molnar Signed-off-by: Tejun Heo --- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 45068269ebb1..8439cdcada94 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -86,17 +86,17 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) */ static int tracing_disabled = 1; -DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); +DEFINE_PER_CPU(int, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { preempt_disable(); - local_inc(&__get_cpu_var(ftrace_cpu_disabled)); + __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); } static inline void ftrace_enable_cpu(void) { - local_dec(&__get_cpu_var(ftrace_cpu_disabled)); + __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); preempt_enable(); } @@ -1085,7 +1085,7 @@ trace_function(struct trace_array *tr, struct ftrace_entry *entry; /* If we are reading the ring buffer, don't trace */ - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) return; event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 405cb850b75d..542f45554883 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -413,7 +413,7 @@ extern int DYN_FTRACE_TEST_NAME(void); extern int ring_buffer_expanded; extern bool tracing_selftest_disabled; -DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); +DECLARE_PER_CPU(int, ftrace_cpu_disabled); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, -- cgit v1.2.1 From dec54bf538326a1503dd780c9f2811f495af95c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Oct 2009 23:23:02 +0900 Subject: this_cpu: Use this_cpu_xx in trace_functions_graph.c ftrace_cpu_disabled usage in trace_functions_graph.c were left 
out during this_cpu_xx conversion in commit 9288f99a causing compile failure. Convert them. Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell Cc: Christoph Lameter --- kernel/trace/trace_functions_graph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 45e6c01b2e4d..90a6daa10962 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -176,7 +176,7 @@ static int __trace_graph_entry(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) return 0; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, @@ -240,7 +240,7 @@ static void __trace_graph_return(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) return; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, -- cgit v1.2.1 From c017b4be3e84176cab10eca5e6c4faeb8cfc6f3e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 28 Oct 2009 13:33:09 +0000 Subject: kmemleak: Simplify the kmemleak_scan_area() function prototype This function was taking unnecessary arguments which can be determined by kmemleak. The patch also modifies the call sites.
Signed-off-by: Catalin Marinas Cc: Pekka Enberg Cc: Christoph Lameter Cc: Rusty Russell --- kernel/module.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 8b7d8805819d..1eb952097077 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2043,9 +2043,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, unsigned int i; /* only scan the sections containing data */ - kmemleak_scan_area(mod->module_core, (unsigned long)mod - - (unsigned long)mod->module_core, - sizeof(struct module), GFP_KERNEL); + kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); for (i = 1; i < hdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -2054,8 +2052,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) continue; - kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - - (unsigned long)mod->module_core, + kmemleak_scan_area((void *)sechdrs[i].sh_addr, sechdrs[i].sh_size, GFP_KERNEL); } } -- cgit v1.2.1 From a6f5aa1ea05686ad6e84593a00a04161e6dfb3a3 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 28 Oct 2009 13:33:10 +0000 Subject: kmemleak: Scan the _ftrace_events section in modules This section contains pointers to allocated objects and not scanning it leads to false positives. Reported-by: Zdenek Kabelac Acked-by: Rusty Russell Signed-off-by: Catalin Marinas --- kernel/module.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1eb952097077..dd29ba43c34f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2380,6 +2380,12 @@ static noinline struct module *load_module(void __user *umod, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. 
+ */ + kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * + mod->num_trace_events, GFP_KERNEL); #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ -- cgit v1.2.1 From 1871e52c76dd95895caeb772f845a1718dcbcd75 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:13 +0900 Subject: percpu: make percpu symbols under kernel/ and mm/ unique This patch updates percpu related symbols under kernel/ and mm/ such that percpu symbols are unique and don't clash with local symbols. This serves two purposes: decreasing the possibility of global percpu symbol collision, and allowing the per_cpu__ prefix to be dropped from percpu symbols. * kernel/lockdep.c: s/lock_stats/cpu_lock_stats/ * kernel/sched.c: s/init_rt_rq/init_rt_rq_var/ (any better idea?) s/sched_group_cpus/sched_groups/ * kernel/softirq.c: s/ksoftirqd/run_ksoftirqd/ * kernel/softlockup.c: s/(*)_timestamp/softlockup_\1_ts/ s/watchdog_task/softlockup_watchdog/ s/timestamp/ts/ for local variables * kernel/time/timer_stats.c: s/lookup_lock/tstats_lookup_lock/ * mm/slab.c: s/reap_work/slab_reap_work/ s/reap_node/slab_reap_node/ * mm/vmstat.c: local variable changed to avoid collision with vmstat_work Partly based on Rusty Russell's "alloc_percpu: rename percpu vars which cause name clashes" patch.
Signed-off-by: Tejun Heo Acked-by: (slab/vmstat) Christoph Lameter Reviewed-by: Christoph Lameter Cc: Rusty Russell Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andrew Morton Cc: Nick Piggin --- kernel/lockdep.c | 11 +++++----- kernel/sched.c | 8 +++---- kernel/softirq.c | 4 ++-- kernel/softlockup.c | 54 +++++++++++++++++++++++------------------------ kernel/time/timer_stats.c | 11 +++++----- 5 files changed, 45 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3815ac1d58b2..8631320a50d0 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -140,7 +140,8 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) } #ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], + cpu_lock_stats); static int lock_point(unsigned long points[], unsigned long ip) { @@ -186,7 +187,7 @@ struct lock_class_stats lock_stats(struct lock_class *class) memset(&stats, 0, sizeof(struct lock_class_stats)); for_each_possible_cpu(cpu) { struct lock_class_stats *pcs = - &per_cpu(lock_stats, cpu)[class - lock_classes]; + &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) stats.contention_point[i] += pcs->contention_point[i]; @@ -213,7 +214,7 @@ void clear_lock_stats(struct lock_class *class) for_each_possible_cpu(cpu) { struct lock_class_stats *cpu_stats = - &per_cpu(lock_stats, cpu)[class - lock_classes]; + &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; memset(cpu_stats, 0, sizeof(struct lock_class_stats)); } @@ -223,12 +224,12 @@ void clear_lock_stats(struct lock_class *class) static struct lock_class_stats *get_lock_stats(struct lock_class *class) { - return &get_cpu_var(lock_stats)[class - lock_classes]; + return &get_cpu_var(cpu_lock_stats)[class - lock_classes]; } static void put_lock_stats(struct lock_class_stats *stats) { - 
put_cpu_var(lock_stats); + put_cpu_var(cpu_lock_stats); } static void lock_release_holdtime(struct held_lock *hlock) diff --git a/kernel/sched.c b/kernel/sched.c index 1535f3884b88..854ab418fd42 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -298,7 +298,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group @@ -8199,14 +8199,14 @@ enum s_alloc { */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); +static DEFINE_PER_CPU(struct static_sched_group, sched_groups); static int cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_cpus, cpu).sg; + *sg = &per_cpu(sched_groups, cpu).sg; return cpu; } #endif /* CONFIG_SCHED_SMT */ @@ -9470,7 +9470,7 @@ void __init sched_init(void) #elif defined CONFIG_USER_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); init_tg_rt_entry(&init_task_group, - &per_cpu(init_rt_rq, i), + &per_cpu(init_rt_rq_var, i), &per_cpu(init_sched_rt_entity, i), i, 1, root_task_group.rt_se[i]); #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index f8749e5216e0..0740dfd55c51 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -697,7 +697,7 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -static int ksoftirqd(void * __bind_cpu) +static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); @@ -810,7 +810,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case 
CPU_UP_PREPARE_FROZEN: - p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); return NOTIFY_BAD; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 81324d12eb35..d22579087e27 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -22,9 +22,9 @@ static DEFINE_SPINLOCK(print_lock); -static DEFINE_PER_CPU(unsigned long, touch_timestamp); -static DEFINE_PER_CPU(unsigned long, print_timestamp); -static DEFINE_PER_CPU(struct task_struct *, watchdog_task); +static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ +static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ +static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static int __read_mostly did_panic; int __read_mostly softlockup_thresh = 60; @@ -70,12 +70,12 @@ static void __touch_softlockup_watchdog(void) { int this_cpu = raw_smp_processor_id(); - __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); + __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu); } void touch_softlockup_watchdog(void) { - __raw_get_cpu_var(touch_timestamp) = 0; + __raw_get_cpu_var(softlockup_touch_ts) = 0; } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -85,7 +85,7 @@ void touch_all_softlockup_watchdogs(void) /* Cause each CPU to re-update its timestamp rather than complain */ for_each_online_cpu(cpu) - per_cpu(touch_timestamp, cpu) = 0; + per_cpu(softlockup_touch_ts, cpu) = 0; } EXPORT_SYMBOL(touch_all_softlockup_watchdogs); @@ -104,28 +104,28 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write, void softlockup_tick(void) { int this_cpu = smp_processor_id(); - unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); - unsigned long print_timestamp; + unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu); + unsigned long print_ts; struct pt_regs *regs = get_irq_regs(); 
unsigned long now; /* Is detection switched off? */ - if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { + if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) { /* Be sure we don't false trigger if switched back on */ - if (touch_timestamp) - per_cpu(touch_timestamp, this_cpu) = 0; + if (touch_ts) + per_cpu(softlockup_touch_ts, this_cpu) = 0; return; } - if (touch_timestamp == 0) { + if (touch_ts == 0) { __touch_softlockup_watchdog(); return; } - print_timestamp = per_cpu(print_timestamp, this_cpu); + print_ts = per_cpu(softlockup_print_ts, this_cpu); /* report at most once a second */ - if (print_timestamp == touch_timestamp || did_panic) + if (print_ts == touch_ts || did_panic) return; /* do not print during early bootup: */ @@ -140,18 +140,18 @@ void softlockup_tick(void) * Wake up the high-prio watchdog task twice per * threshold timespan. */ - if (now > touch_timestamp + softlockup_thresh/2) - wake_up_process(per_cpu(watchdog_task, this_cpu)); + if (now > touch_ts + softlockup_thresh/2) + wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); /* Warn about unreasonable delays: */ - if (now <= (touch_timestamp + softlockup_thresh)) + if (now <= (touch_ts + softlockup_thresh)) return; - per_cpu(print_timestamp, this_cpu) = touch_timestamp; + per_cpu(softlockup_print_ts, this_cpu) = touch_ts; spin_lock(&print_lock); printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! 
[%s:%d]\n", - this_cpu, now - touch_timestamp, + this_cpu, now - touch_ts, current->comm, task_pid_nr(current)); print_modules(); print_irqtrace_events(current); @@ -209,32 +209,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - BUG_ON(per_cpu(watchdog_task, hotcpu)); + BUG_ON(per_cpu(softlockup_watchdog, hotcpu)); p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); if (IS_ERR(p)) { printk(KERN_ERR "watchdog for %i failed\n", hotcpu); return NOTIFY_BAD; } - per_cpu(touch_timestamp, hotcpu) = 0; - per_cpu(watchdog_task, hotcpu) = p; + per_cpu(softlockup_touch_ts, hotcpu) = 0; + per_cpu(softlockup_watchdog, hotcpu) = p; kthread_bind(p, hotcpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(watchdog_task, hotcpu)); + wake_up_process(per_cpu(softlockup_watchdog, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(watchdog_task, hotcpu)) + if (!per_cpu(softlockup_watchdog, hotcpu)) break; /* Unbind so it can run. Fall thru. 
*/ - kthread_bind(per_cpu(watchdog_task, hotcpu), + kthread_bind(per_cpu(softlockup_watchdog, hotcpu), cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: - p = per_cpu(watchdog_task, hotcpu); - per_cpu(watchdog_task, hotcpu) = NULL; + p = per_cpu(softlockup_watchdog, hotcpu); + per_cpu(softlockup_watchdog, hotcpu) = NULL; kthread_stop(p); break; #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index ee5681f8d7ec..63b117e9eba1 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock); /* * Per-CPU lookup locks for fast hash lookup: */ -static DEFINE_PER_CPU(spinlock_t, lookup_lock); +static DEFINE_PER_CPU(spinlock_t, tstats_lookup_lock); /* * Mutex to serialize state changes with show-stats activities: @@ -245,7 +245,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, if (likely(!timer_stats_active)) return; - lock = &per_cpu(lookup_lock, raw_smp_processor_id()); + lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id()); input.timer = timer; input.start_func = startf; @@ -348,9 +348,10 @@ static void sync_access(void) int cpu; for_each_online_cpu(cpu) { - spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); + spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu); + spin_lock_irqsave(lock, flags); /* nothing */ - spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); + spin_unlock_irqrestore(lock, flags); } } @@ -408,7 +409,7 @@ void __init init_timer_stats(void) int cpu; for_each_possible_cpu(cpu) - spin_lock_init(&per_cpu(lookup_lock, cpu)); + spin_lock_init(&per_cpu(tstats_lookup_lock, cpu)); } static int __init init_tstats_procfs(void) -- cgit v1.2.1 From 9705f69ed0a5ef593f45e618bcb3cbfdbf391f64 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:13 +0900 Subject: percpu: make percpu symbols in tracer unique This patch updates percpu related symbols in kernel tracer such that percpu 
symbols are unique and don't clash with local symbols. This serves two purposes of decreasing the possibility of global percpu symbol collision and allowing dropping per_cpu__ prefix from percpu symbols. * kernel/trace/trace.c: s/max_data/max_tr_data/ * kernel/trace/trace_hw_branches: s/tracer/hwb_tracer/, s/buffer/hwb_buffer/ Partly based on Rusty Russell's "alloc_percpu: rename percpu vars which cause name clashes" patch. Signed-off-by: Tejun Heo Acked-by: Steven Rostedt Cc: Rusty Russell Cc: Frederic Weisbecker Cc: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_hw_branches.c | 51 ++++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8439cdcada94..85a5ed70b5b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -203,7 +203,7 @@ cycle_t ftrace_now(int cpu) */ static struct trace_array max_tr; -static DEFINE_PER_CPU(struct trace_array_cpu, max_data); +static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); /* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; @@ -4426,7 +4426,7 @@ __init static int tracer_alloc_buffers(void) /* Allocate the first page for all buffers */ for_each_tracing_cpu(i) { global_trace.data[i] = &per_cpu(global_trace_cpu, i); - max_tr.data[i] = &per_cpu(max_data, i); + max_tr.data[i] = &per_cpu(max_tr_data, i); } trace_init_cmdlines(); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 23b63859130e..adaf7a39d0dc 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -20,10 +20,10 @@ #define BTS_BUFFER_SIZE (1 << 13) -static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); +static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer); +static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer); -#define this_tracer 
per_cpu(tracer, smp_processor_id()) +#define this_tracer per_cpu(hwb_tracer, smp_processor_id()) static int trace_hw_branches_enabled __read_mostly; static int trace_hw_branches_suspended __read_mostly; @@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly; static void bts_trace_init_cpu(int cpu) { - per_cpu(tracer, cpu) = - ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); + per_cpu(hwb_tracer, cpu) = + ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu), + BTS_BUFFER_SIZE, NULL, (size_t)-1, + BTS_KERNEL); - if (IS_ERR(per_cpu(tracer, cpu))) - per_cpu(tracer, cpu) = NULL; + if (IS_ERR(per_cpu(hwb_tracer, cpu))) + per_cpu(hwb_tracer, cpu) = NULL; } static int bts_trace_init(struct trace_array *tr) @@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr) for_each_online_cpu(cpu) { bts_trace_init_cpu(cpu); - if (likely(per_cpu(tracer, cpu))) + if (likely(per_cpu(hwb_tracer, cpu))) trace_hw_branches_enabled = 1; } trace_hw_branches_suspended = 0; @@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr) get_online_cpus(); for_each_online_cpu(cpu) { - if (likely(per_cpu(tracer, cpu))) { - ds_release_bts(per_cpu(tracer, cpu)); - per_cpu(tracer, cpu) = NULL; + if (likely(per_cpu(hwb_tracer, cpu))) { + ds_release_bts(per_cpu(hwb_tracer, cpu)); + per_cpu(hwb_tracer, cpu) = NULL; } } trace_hw_branches_enabled = 0; @@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr) get_online_cpus(); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_resume_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + ds_resume_bts(per_cpu(hwb_tracer, cpu)); trace_hw_branches_suspended = 0; put_online_cpus(); } @@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr) get_online_cpus(); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + 
ds_suspend_bts(per_cpu(hwb_tracer, cpu)); trace_hw_branches_suspended = 1; put_online_cpus(); } @@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, bts_trace_init_cpu(cpu); if (trace_hw_branches_suspended && - likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); + likely(per_cpu(hwb_tracer, cpu))) + ds_suspend_bts(per_cpu(hwb_tracer, cpu)); } break; case CPU_DOWN_PREPARE: /* The notification is sent with interrupts enabled. */ - if (likely(per_cpu(tracer, cpu))) { - ds_release_bts(per_cpu(tracer, cpu)); - per_cpu(tracer, cpu) = NULL; + if (likely(per_cpu(hwb_tracer, cpu))) { + ds_release_bts(per_cpu(hwb_tracer, cpu)); + per_cpu(hwb_tracer, cpu) = NULL; } } @@ -256,8 +257,8 @@ static void trace_bts_prepare(struct trace_iterator *iter) get_online_cpus(); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + ds_suspend_bts(per_cpu(hwb_tracer, cpu)); /* * We need to collect the trace on the respective cpu since ftrace * implicitly adds the record for the current cpu. @@ -266,8 +267,8 @@ static void trace_bts_prepare(struct trace_iterator *iter) on_each_cpu(trace_bts_cpu, iter->tr, 1); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_resume_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + ds_resume_bts(per_cpu(hwb_tracer, cpu)); put_online_cpus(); } -- cgit v1.2.1 From 1e5ad9679016275d422e36b12a98b0927d76f556 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 2 Nov 2009 10:45:36 -0700 Subject: resources: when allocate_resource() fails, leave resource untouched When "allocate_resource(root, new, size, ...)" fails, we currently clobber "new". This is inconvenient for the caller, who might care about the original contents of the resource. 
For example, when pci_bus_alloc_resource() fails, the "can't allocate mem resource %pR" message from pci_assign_resources() currently contains junk for the resource start/end. This patch delays the "new" update until we're about to return success. Acked-by: Linus Torvalds Acked-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index fb11a58b9594..dc15686b7a77 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; + resource_size_t start, end; - new->start = root->start; + start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment * of this->start - 1 to new->end below would cause an underflow. */ if (this && this->start == 0) { - new->start = this->end + 1; + start = this->end + 1; this = this->sibling; } for(;;) { if (this) - new->end = this->start - 1; + end = this->start - 1; else - new->end = root->end; - if (new->start < min) - new->start = min; - if (new->end > max) - new->end = max; - new->start = ALIGN(new->start, align); + end = root->end; + if (start < min) + start = min; + if (end > max) + end = max; + start = ALIGN(start, align); if (alignf) alignf(alignf_data, new, size, align); - if (new->start < new->end && new->end - new->start >= size - 1) { - new->end = new->start + size - 1; + if (start < end && end - start >= size - 1) { + new->start = start; + new->end = start + size - 1; return 0; } if (!this) break; - new->start = this->end + 1; + start = this->end + 1; this = this->sibling; } return -EBUSY; -- cgit v1.2.1 From dc186ad741c12ae9ecac8b89e317ef706fdaf8f6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Nov 2009 01:09:48 +0900 Subject: workqueue: Add 
debugobjects support Add debugobject support to track the life time of work_structs. While at it, remove duplicate definition of INIT_DELAYED_WORK_ON_STACK(). Signed-off-by: Thomas Gleixner Signed-off-by: Tejun Heo --- kernel/workqueue.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 128 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 12328147132c..ddad63fbb61b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,6 +68,116 @@ struct workqueue_struct { #endif }; +#ifdef CONFIG_DEBUG_OBJECTS_WORK + +static struct debug_obj_descr work_debug_descr; + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int work_fixup_init(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_init(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int work_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. The work struct was + * statically initialized. We just make sure that it + * is tracked in the object tracker. 
+ */ + if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { + debug_object_init(work, &work_debug_descr); + debug_object_activate(work, &work_debug_descr); + return 0; + } + WARN_ON_ONCE(1); + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int work_fixup_free(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_free(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr work_debug_descr = { + .name = "work_struct", + .fixup_init = work_fixup_init, + .fixup_activate = work_fixup_activate, + .fixup_free = work_fixup_free, +}; + +static inline void debug_work_activate(struct work_struct *work) +{ + debug_object_activate(work, &work_debug_descr); +} + +static inline void debug_work_deactivate(struct work_struct *work) +{ + debug_object_deactivate(work, &work_debug_descr); +} + +void __init_work(struct work_struct *work, int onstack) +{ + if (onstack) + debug_object_init_on_stack(work, &work_debug_descr); + else + debug_object_init(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(__init_work); + +void destroy_work_on_stack(struct work_struct *work) +{ + debug_object_free(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_work_on_stack); + +#else +static inline void debug_work_activate(struct work_struct *work) { } +static inline void debug_work_deactivate(struct work_struct *work) { } +#endif + /* Serializes the accesses to the list of workqueues. 
*/ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); @@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, { unsigned long flags; + debug_work_activate(work); spin_lock_irqsave(&cwq->lock, flags); insert_work(cwq, work, &cwq->worklist); spin_unlock_irqrestore(&cwq->lock, flags); @@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) struct lockdep_map lockdep_map = work->lockdep_map; #endif trace_workqueue_execution(cwq->thread, work); + debug_work_deactivate(work); cwq->current_work = work; list_del_init(cwq->worklist.next); spin_unlock_irq(&cwq->lock); @@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work) static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, struct wq_barrier *barr, struct list_head *head) { - INIT_WORK(&barr->work, wq_barrier_func); + /* + * debugobject calls are safe here even with cwq->lock locked + * as we know for sure that this will not trigger any of the + * checks and call back into the fixup functions where we + * might deadlock. 
+ */ + INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); init_completion(&barr->done); + debug_work_activate(&barr->work); insert_work(cwq, &barr->work, head); } @@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) } spin_unlock_irq(&cwq->lock); - if (active) + if (active) { wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } return active; } @@ -451,6 +572,7 @@ out: return 0; wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); return 1; } EXPORT_SYMBOL_GPL(flush_work); @@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work) */ smp_rmb(); if (cwq == get_wq_data(work)) { + debug_work_deactivate(work); list_del_init(&work->entry); ret = 1; } @@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, } spin_unlock_irq(&cwq->lock); - if (unlikely(running)) + if (unlikely(running)) { wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } } static void wait_on_work(struct work_struct *work) -- cgit v1.2.1 From 456b565cc52fbcdaa2e19ffdf40d9dd3b726d603 Mon Sep 17 00:00:00 2001 From: Simon Kagstrom Date: Fri, 16 Oct 2009 14:09:18 +0200 Subject: core: Add kernel message dumper to call on oopses and panics The core functionality is implemented as per Linus suggestion from http://lists.infradead.org/pipermail/linux-mtd/2009-October/027620.html (with the kmsg_dump implementation by Linus). A struct kmsg_dumper has been added which contains a callback to dump the kernel log buffers on crashes. The kmsg_dump function gets called from oops_exit() and panic() and invokes this callbacks with the crash reason. 
[dwmw2: Fix log_end handling] Signed-off-by: Simon Kagstrom Reviewed-by: Anders Grafstrom Reviewed-by: Linus Torvalds Acked-by: Ingo Molnar Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- kernel/panic.c | 3 ++ kernel/printk.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index bcdef26e3332..8c43226a544d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include @@ -74,6 +75,7 @@ NORET_TYPE void panic(const char * fmt, ...) dump_stack(); #endif + kmsg_dump(KMSG_DUMP_PANIC); /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. @@ -338,6 +340,7 @@ void oops_exit(void) { do_oops_enter_exit(); print_oops_end_marker(); + kmsg_dump(KMSG_DUMP_OOPS); } #ifdef WANT_WARN_ON_SLOWPATH diff --git a/kernel/printk.c b/kernel/printk.c index f38b07f78a4e..051d1f50648f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -1405,3 +1406,121 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies, } EXPORT_SYMBOL(printk_timed_ratelimit); #endif + +static DEFINE_SPINLOCK(dump_list_lock); +static LIST_HEAD(dump_list); + +/** + * kmsg_dump_register - register a kernel log dumper. + * @dump: pointer to the kmsg_dumper structure + * + * Adds a kernel log dumper to the system. The dump callback in the + * structure will be called when the kernel oopses or panics and must be + * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. 
+ */ +int kmsg_dump_register(struct kmsg_dumper *dumper) +{ + unsigned long flags; + int err = -EBUSY; + + /* The dump callback needs to be set */ + if (!dumper->dump) + return -EINVAL; + + spin_lock_irqsave(&dump_list_lock, flags); + /* Don't allow registering multiple times */ + if (!dumper->registered) { + dumper->registered = 1; + list_add_tail(&dumper->list, &dump_list); + err = 0; + } + spin_unlock_irqrestore(&dump_list_lock, flags); + + return err; +} +EXPORT_SYMBOL_GPL(kmsg_dump_register); + +/** + * kmsg_dump_unregister - unregister a kmsg dumper. + * @dump: pointer to the kmsg_dumper structure + * + * Removes a dump device from the system. Returns zero on success and + * %-EINVAL otherwise. + */ +int kmsg_dump_unregister(struct kmsg_dumper *dumper) +{ + unsigned long flags; + int err = -EINVAL; + + spin_lock_irqsave(&dump_list_lock, flags); + if (dumper->registered) { + dumper->registered = 0; + list_del(&dumper->list); + err = 0; + } + spin_unlock_irqrestore(&dump_list_lock, flags); + + return err; +} +EXPORT_SYMBOL_GPL(kmsg_dump_unregister); + +static const char const *kmsg_reasons[] = { + [KMSG_DUMP_OOPS] = "oops", + [KMSG_DUMP_PANIC] = "panic", +}; + +static const char *kmsg_to_str(enum kmsg_dump_reason reason) +{ + if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0) + return "unknown"; + + return kmsg_reasons[reason]; +} + +/** + * kmsg_dump - dump kernel log to kernel message dumpers. + * @reason: the reason (oops, panic etc) for dumping + * + * Iterate through each of the dump devices and call the oops/panic + * callbacks with the log buffer. + */ +void kmsg_dump(enum kmsg_dump_reason reason) +{ + unsigned long end; + unsigned chars; + struct kmsg_dumper *dumper; + const char *s1, *s2; + unsigned long l1, l2; + unsigned long flags; + + /* Theoretically, the log could move on after we do this, but + there's not a lot we can do about that. The new messages + will overwrite the start of what we dump. 
*/ + spin_lock_irqsave(&logbuf_lock, flags); + end = log_end & LOG_BUF_MASK; + chars = logged_chars; + spin_unlock_irqrestore(&logbuf_lock, flags); + + if (logged_chars > end) { + s1 = log_buf + log_buf_len - logged_chars + end; + l1 = logged_chars - end; + + s2 = log_buf; + l2 = end; + } else { + s1 = ""; + l1 = 0; + + s2 = log_buf + end - logged_chars; + l2 = logged_chars; + } + + if (!spin_trylock_irqsave(&dump_list_lock, flags)) { + printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", + kmsg_to_str(reason)); + return; + } + list_for_each_entry(dumper, &dump_list, list) + dumper->dump(dumper, reason, s1, l1, s2, l2); + spin_unlock_irqrestore(&dump_list_lock, flags); +} -- cgit v1.2.1 From 595dd3d8bf953254d8d2f30f99c54fe09c470040 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 1 Dec 2009 10:52:02 -0800 Subject: kmsg_dump: fix build for CONFIG_PRINTK=n kmsg_dump() fails to build when CONFIG_PRINTK=n; provide stubs for the kmsg_dump*() functions when CONFIG_PRINTK=n. 
kernel/printk.c: In function 'kmsg_dump': kernel/printk.c:1501: error: 'log_buf_len' undeclared (first use in this function) kernel/printk.c:1502: error: 'logged_chars' undeclared (first use in this function) kernel/printk.c:1506: error: 'log_buf' undeclared (first use in this function) Signed-off-by: Randy Dunlap Acked-by: Simon Kagstrom Signed-off-by: David Woodhouse --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 051d1f50648f..2a564570f822 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1405,7 +1405,6 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies, return false; } EXPORT_SYMBOL(printk_timed_ratelimit); -#endif static DEFINE_SPINLOCK(dump_list_lock); static LIST_HEAD(dump_list); @@ -1524,3 +1523,4 @@ void kmsg_dump(enum kmsg_dump_reason reason) dumper->dump(dumper, reason, s1, l1, s2, l2); spin_unlock_irqrestore(&dump_list_lock, flags); } +#endif -- cgit v1.2.1 From 2f0993e0fb663c49e4d1e02654f6203246be4817 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 07:06:10 +0100 Subject: hw-breakpoints: Drop callback and task parameters from modify helper Drop the callback and task parameters from modify_user_hw_breakpoint(). For now we have no user that need to modify a breakpoint to the point of changing its handler or its task context. Signed-off-by: Frederic Weisbecker Cc: "K. 
Prasad" --- kernel/hw_breakpoint.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index cf5ee1628411..2d10b012828f 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -312,9 +312,7 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); * @tsk: pointer to 'task_struct' of the process to which the address belongs */ struct perf_event * -modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr, - perf_callback_t triggered, - struct task_struct *tsk) +modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { /* * FIXME: do it without unregistering @@ -323,7 +321,8 @@ modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr, */ unregister_hw_breakpoint(bp); - return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); + return perf_event_create_kernel_counter(attr, -1, bp->ctx->task->pid, + bp->callback); } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); -- cgit v1.2.1 From b326e9560a28fc3e950637ef51847ed8f05c1335 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 09:44:31 +0100 Subject: hw-breakpoints: Use overflow handler instead of the event callback struct perf_event::event callback was called when a breakpoint triggers. But this is a rather opaque callback, pretty tied-only to the breakpoint API and not really integrated into perf as it triggers even when we don't overflow. We prefer to use overflow_handler() as it fits into the perf events rules, being called only when we overflow. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. 
Prasad" --- kernel/hw_breakpoint.c | 17 +++++------------ kernel/perf_event.c | 24 +++++++++--------------- kernel/trace/trace_ksym.c | 5 +++-- 3 files changed, 17 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 2d10b012828f..b600fc27f161 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -259,7 +259,7 @@ void release_bp_slot(struct perf_event *bp) } -int __register_perf_hw_breakpoint(struct perf_event *bp) +int register_perf_hw_breakpoint(struct perf_event *bp) { int ret; @@ -276,19 +276,12 @@ int __register_perf_hw_breakpoint(struct perf_event *bp) * This is a quick hack that will be removed soon, once we remove * the tmp breakpoints from ptrace */ - if (!bp->attr.disabled || bp->callback == perf_bp_event) + if (!bp->attr.disabled || !bp->overflow_handler) ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); return ret; } -int register_perf_hw_breakpoint(struct perf_event *bp) -{ - bp->callback = perf_bp_event; - - return __register_perf_hw_breakpoint(bp); -} - /** * register_user_hw_breakpoint - register a hardware breakpoint for user space * @attr: breakpoint attributes @@ -297,7 +290,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) */ struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, struct task_struct *tsk) { return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); @@ -322,7 +315,7 @@ modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) unregister_hw_breakpoint(bp); return perf_event_create_kernel_counter(attr, -1, bp->ctx->task->pid, - bp->callback); + bp->overflow_handler); } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); @@ -347,7 +340,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); */ struct perf_event ** register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered) + perf_overflow_handler_t 
triggered) { struct perf_event **cpu_events, **pevent, *bp; long err; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 6b7ddba1dd64..fd43ff4ac860 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4286,15 +4286,8 @@ static void bp_perf_event_destroy(struct perf_event *event) static const struct pmu *bp_perf_event_init(struct perf_event *bp) { int err; - /* - * The breakpoint is already filled if we haven't created the counter - * through perf syscall - * FIXME: manage to get trigerred to NULL if it comes from syscalls - */ - if (!bp->callback) - err = register_perf_hw_breakpoint(bp); - else - err = __register_perf_hw_breakpoint(bp); + + err = register_perf_hw_breakpoint(bp); if (err) return ERR_PTR(err); @@ -4390,7 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr, struct perf_event_context *ctx, struct perf_event *group_leader, struct perf_event *parent_event, - perf_callback_t callback, + perf_overflow_handler_t overflow_handler, gfp_t gfpflags) { const struct pmu *pmu; @@ -4433,10 +4426,10 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; - if (!callback && parent_event) - callback = parent_event->callback; + if (!overflow_handler && parent_event) + overflow_handler = parent_event->overflow_handler; - event->callback = callback; + event->overflow_handler = overflow_handler; if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -4776,7 +4769,8 @@ err_put_context: */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - pid_t pid, perf_callback_t callback) + pid_t pid, + perf_overflow_handler_t overflow_handler) { struct perf_event *event; struct perf_event_context *ctx; @@ -4793,7 +4787,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, } event = perf_event_alloc(attr, cpu, ctx, NULL, - NULL, callback, GFP_KERNEL); + NULL, overflow_handler, GFP_KERNEL); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_put_context; 
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index ddfa0fd43bc0..acb87d4a4ac1 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -79,11 +79,12 @@ void ksym_collect_stats(unsigned long hbp_hit_addr) } #endif /* CONFIG_PROFILE_KSYM_TRACER */ -void ksym_hbp_handler(struct perf_event *hbp, void *data) +void ksym_hbp_handler(struct perf_event *hbp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { struct ring_buffer_event *event; struct ksym_trace_entry *entry; - struct pt_regs *regs = data; struct ring_buffer *buffer; int pc; -- cgit v1.2.1 From 109d71c6dd52ec08878c2c67eb4c0bd67fcbc80b Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Thu, 19 Nov 2009 13:42:06 -0800 Subject: lockstat: Fix min, max times in /proc/lock_stats Fix min, max times in /proc/lock_stats (1) When collecting lock hold and wait times, if the current minimum time is zero, it will be replaced by the next time. (2) When aggregating minimum and maximum lock hold and wait times across cpus, the values are added, instead of selecting the minimum and maximum.
Signed-off-by: Frank Rowand Signed-off-by: Peter Zijlstra LKML-Reference: <4B05BBAE.2050005@am.sony.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f5dcd36d3151..7a3ae56b3a7f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -168,7 +168,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time) if (time > lt->max) lt->max = time; - if (time < lt->min || !lt->min) + if (time < lt->min || !lt->nr) lt->min = time; lt->total += time; @@ -177,8 +177,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time) static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) { - dst->min += src->min; - dst->max += src->max; + if (!src->nr) + return; + + if (src->max > dst->max) + dst->max = src->max; + + if (src->min < dst->min || !dst->nr) + dst->min = src->min; + dst->total += src->total; dst->nr += src->nr; } -- cgit v1.2.1 From e1b8090bdf125f8b2e192149547fead7f302a89c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 6 Dec 2009 20:41:16 +0100 Subject: cpumask: Fix generate_sched_domains() for UP Commit acc3f5d7cabbfd6cec71f0c1f9900621fa2d6ae7 ("cpumask: Partition_sched_domains takes array of cpumask_var_t") changed the function signature of generate_sched_domains() for the CONFIG_SMP=y case, but forgot to update the corresponding function for the CONFIG_SMP=n case, causing: kernel/cpuset.c:2073: warning: passing argument 1 of 'generate_sched_domains' from incompatible pointer type Signed-off-by: Geert Uytterhoeven Cc: Rusty Russell Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3cf2183b472d..43fb7e800028 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -737,7 +737,7 @@ static void do_rebuild_sched_domains(struct 
work_struct *unused) { } -static int generate_sched_domains(struct cpumask **domains, +static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { *domains = NULL; -- cgit v1.2.1 From 6ad4c18884e864cf4c77f9074d3d1816063f99cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Nov 2009 13:31:39 +0100 Subject: sched: Fix balance vs hotplug race Since (e761b77: cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment) we have cpu_active_mask which is suppose to rule scheduler migration and load-balancing, except it never (fully) did. The particular problem being solved here is a crash in try_to_wake_up() where select_task_rq() ends up selecting an offline cpu because select_task_rq_fair() trusts the sched_domain tree to reflect the current state of affairs, similarly select_task_rq_rt() trusts the root_domain. However, the sched_domains are updated from CPU_DEAD, which is after the cpu is taken offline and after stop_machine is done. Therefore it can race perfectly well with code assuming the domains are right. Cure this by building the domains from cpu_active_mask on CPU_DOWN_PREPARE. 
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/cpu.c | 18 +++++++++++++----- kernel/cpuset.c | 16 +++++++++------- kernel/sched.c | 32 +++++++++++++++++--------------- 3 files changed, 39 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ba0f1ecb212..b21688640377 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { + set_cpu_active(cpu, true); + nr_calls--; __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); @@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) /* Ensure that we are not runnable on dying cpu */ cpumask_copy(old_allowed, &current->cpus_allowed); - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_any_but(cpu_online_mask, cpu))); + set_cpus_allowed_ptr(current, cpu_active_mask); err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { + set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain.
*/ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) @@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu) err = _cpu_down(cpu, 0); - if (cpu_online(cpu)) - set_cpu_active(cpu, true); - out: cpu_maps_update_done(); stop_machine_destroy(); @@ -387,6 +386,15 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); + + for_each_online_cpu(cpu) { + if (cpu == first_cpu) + continue; + set_cpu_active(cpu, false); + } + + synchronize_sched(); + printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 43fb7e800028..ba401fab459f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -872,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) + if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) return -EINVAL; } retval = validate_change(cs, trialcs); @@ -2010,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) } /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; @@ -2019,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) /* Remove offline cpus and mems from this cpuset. 
*/ mutex_lock(&callback_mutex); cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_online_mask); + cpu_active_mask); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); mutex_unlock(&callback_mutex); @@ -2057,8 +2057,10 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, switch (phase) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: break; default: @@ -2067,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, cgroup_lock(); mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); mutex_unlock(&callback_mutex); scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); @@ -2114,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, void __init cpuset_init_smp(void) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); diff --git a/kernel/sched.c b/kernel/sched.c index aa31244caa9f..281da29d0801 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4134,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_copy(cpus, cpu_online_mask); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4297,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_copy(cpus, cpu_online_mask); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the 
parent domain, idle @@ -4694,7 +4694,7 @@ int select_nohz_load_balancer(int stop_tick) cpumask_set_cpu(cpu, nohz.cpu_mask); /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -7093,7 +7093,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpumask_intersects(new_mask, cpu_online_mask)) { + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; } @@ -7115,7 +7115,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ struct task_struct *mt = rq->migration_thread; @@ -7269,19 +7269,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) again: /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto move; /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); if (dest_cpu < nr_cpu_ids) goto move; /* No more Mr. Nice Guy. 
*/ if (dest_cpu >= nr_cpu_ids) { cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); /* * Don't tell them about moving exiting tasks or @@ -7310,7 +7310,7 @@ move: */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); unsigned long flags; local_irq_save(flags); @@ -7564,7 +7564,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) static struct ctl_table_header *sd_sysctl_header; static void register_sched_domain_sysctl(void) { - int i, cpu_num = num_online_cpus(); + int i, cpu_num = num_possible_cpus(); struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); char buf[32]; @@ -7574,7 +7574,7 @@ static void register_sched_domain_sysctl(void) if (entry == NULL) return; - for_each_online_cpu(i) { + for_each_possible_cpu(i) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); entry->mode = 0555; @@ -9100,7 +9100,7 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } @@ -9231,8 +9231,10 @@ static int update_sched_domains(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; @@ -9279,7 +9281,7 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_online_mask); + arch_init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if 
(cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); -- cgit v1.2.1 From 56053170ea2a2c0dc17420e9b94aa3ca51d80408 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Dec 2009 06:46:48 +0100 Subject: hw-breakpoints: Fix task-bound breakpoint slot allocation Whatever the context nature of a breakpoint, we always perform the following constraint checks before allocating it a slot: - Check the number of pinned breakpoint bound the concerned cpus - Check the max number of task-bound breakpoints that are belonging to a task. - Add both and see if we have a remaining slot for the new breakpoint This is the right thing to do when we are about to register a cpu-only bound breakpoint. But not if we are dealing with a task bound breakpoint. What we want in this case is: - Check the number of pinned breakpoint bound the concerned cpus - Check the number of breakpoints that already belong to the task in which the breakpoint to register is bound to. - Add both This fixes a regression that makes the "firefox -g" command fail to register breakpoints once we deal with a secondary thread.
Reported-by: Walt Signed-off-by: Frederic Weisbecker Cc: Prasad --- kernel/hw_breakpoint.c | 74 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index b600fc27f161..02b492504a5a 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -83,15 +83,51 @@ static unsigned int max_task_bp_pinned(int cpu) return 0; } +static int task_bp_pinned(struct task_struct *tsk) +{ + struct perf_event_context *ctx = tsk->perf_event_ctxp; + struct list_head *list; + struct perf_event *bp; + unsigned long flags; + int count = 0; + + if (WARN_ONCE(!ctx, "No perf context for this task")) + return 0; + + list = &ctx->event_list; + + spin_lock_irqsave(&ctx->lock, flags); + + /* + * The current breakpoint counter is not included in the list + * at the open() callback time + */ + list_for_each_entry(bp, list, event_entry) { + if (bp->attr.type == PERF_TYPE_BREAKPOINT) + count++; + } + + spin_unlock_irqrestore(&ctx->lock, flags); + + return count; +} + /* * Report the number of pinned/un-pinned breakpoints we have in * a given cpu (cpu > -1) or in all of them (cpu = -1). 
*/ -static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) +static void +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) { + int cpu = bp->cpu; + struct task_struct *tsk = bp->ctx->task; + if (cpu >= 0) { slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); - slots->pinned += max_task_bp_pinned(cpu); + if (!tsk) + slots->pinned += max_task_bp_pinned(cpu); + else + slots->pinned += task_bp_pinned(tsk); slots->flexible = per_cpu(nr_bp_flexible, cpu); return; @@ -101,7 +137,10 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) unsigned int nr; nr = per_cpu(nr_cpu_bp_pinned, cpu); - nr += max_task_bp_pinned(cpu); + if (!tsk) + nr += max_task_bp_pinned(cpu); + else + nr += task_bp_pinned(tsk); if (nr > slots->pinned) slots->pinned = nr; @@ -118,33 +157,10 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) */ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) { - int count = 0; - struct perf_event *bp; - struct perf_event_context *ctx = tsk->perf_event_ctxp; unsigned int *tsk_pinned; - struct list_head *list; - unsigned long flags; - - if (WARN_ONCE(!ctx, "No perf context for this task")) - return; - - list = &ctx->event_list; - - spin_lock_irqsave(&ctx->lock, flags); - - /* - * The current breakpoint counter is not included in the list - * at the open() callback time - */ - list_for_each_entry(bp, list, event_entry) { - if (bp->attr.type == PERF_TYPE_BREAKPOINT) - count++; - } - - spin_unlock_irqrestore(&ctx->lock, flags); + int count = 0; - if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list")) - return; + count = task_bp_pinned(tsk); tsk_pinned = per_cpu(task_bp_pinned, cpu); if (enable) { @@ -233,7 +249,7 @@ int reserve_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); - fetch_bp_busy_slots(&slots, bp->cpu); + fetch_bp_busy_slots(&slots, bp); /* Flexible counters need to keep at least one slot */ if (slots.pinned + (!!slots.flexible) 
== HBP_NUM) { -- cgit v1.2.1 From c521efd1700a8c0f7ce26f011f5eaecca17fabfa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Dec 2009 09:06:24 -0500 Subject: tracing: Add pipe_close interface An ftrace plugin can add a pipe_open interface when the user opens trace_pipe. But if the plugin allocates something within the pipe_open it can not free it because there exists no pipe_close. The hook to the trace file open has a corresponding close. The closing of the trace_pipe file should also have a corresponding close. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++++ kernel/trace/trace.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 874f2893cff0..f804b407d438 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2898,6 +2898,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) else cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); + + if (iter->trace->pipe_open) + iter->trace->pipe_close(iter); + mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1d7f4830a80d..7fa33cab6962 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -272,6 +272,7 @@ struct tracer_flags { * @pipe_open: called when the trace_pipe file is opened * @wait_pipe: override how the user waits for traces on trace_pipe * @close: called when the trace file is released + * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe * @splice_read: override the default splice_read callback on trace_pipe * @selftest: selftest to run on boot (see trace_selftest.c) @@ -290,6 +291,7 @@ struct tracer { void (*pipe_open)(struct trace_iterator *iter); void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); + void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator 
*iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); -- cgit v1.2.1 From 6ab8886326a1b9a3a8d164d8174e3c20703a03a2 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 8 Dec 2009 18:25:15 +1100 Subject: perf: hw_breakpoints: Fix percpu namespace clash Today's linux-next build failed with: kernel/hw_breakpoint.c:86: error: 'task_bp_pinned' redeclared as different kind of symbol ... Caused by commit dd17c8f72993f9461e9c19250e3f155d6d99df22 ("percpu: remove per_cpu__ prefix") from the percpu tree interacting with commit 56053170ea2a2c0dc17420e9b94aa3ca51d80408 ("hw-breakpoints: Fix task-bound breakpoint slot allocation") from the tip tree. Signed-off-by: Stephen Rothwell Acked-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Tejun Heo Cc: Rusty Russell Cc: Christoph Lameter Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: <20091208182515.bb6dda4a.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 02b492504a5a..03a0773ac2b2 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -52,7 +52,7 @@ static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]); +static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); /* Number of non-pinned cpu/task breakpoints in a cpu */ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); @@ -73,7 +73,7 @@ static DEFINE_MUTEX(nr_bp_mutex); static unsigned int max_task_bp_pinned(int cpu) { int i; - unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu); + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); for (i = HBP_NUM -1; i >= 0; i--) { if (tsk_pinned[i] > 0) @@ -162,7 +162,7 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) count = task_bp_pinned(tsk); - tsk_pinned = 
per_cpu(task_bp_pinned, cpu); + tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); if (enable) { tsk_pinned[count]++; if (count > 0) @@ -209,7 +209,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to a single cpu, check: * * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM * * -> If there are already non-pinned counters in this cpu, it means * there is already a free slot for them. @@ -220,7 +220,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to every cpus, check: * * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM * * -> This is roughly the same, except we check the number of per cpu * bp for every cpu and we keep the max one. Same for the per tasks @@ -232,7 +232,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to a single cpu, check: * * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM * * -> Same checks as before. But now the nr_bp_flexible, if any, must keep * one register at least (or they will never be fed). 
@@ -240,7 +240,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to every cpus, check: * * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ int reserve_bp_slot(struct perf_event *bp) { -- cgit v1.2.1 From 722d0172377a5697919b9f7e5beb95165b1dec4e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 8 Dec 2009 13:19:42 +0100 Subject: futex: Take mmap_sem for get_user_pages in fault_in_user_writeable get_user_pages() must be called with mmap_sem held. Signed-off-by: Andi Kleen Cc: stable@kernel.org Cc: Andrew Morton Cc: Nick Piggin Cc: Darren Hart Cc: Peter Zijlstra LKML-Reference: <20091208121942.GA21298@basil.fritz.box> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index fb65e822fc41..d73ef1f3e55d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -304,8 +304,14 @@ void put_futex_key(int fshared, union futex_key *key) */ static int fault_in_user_writeable(u32 __user *uaddr) { - int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, - 1, 1, 0, NULL, NULL); + struct mm_struct *mm = current->mm; + int ret; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, (unsigned long)uaddr, + 1, 1, 0, NULL, NULL); + up_read(&mm->mmap_sem); + return ret < 0 ? ret : 0; } -- cgit v1.2.1 From a7c312bed772c11138409c3a98531e85d690302e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Dec 2009 17:03:16 -0500 Subject: trace-kprobe: Support delete probe syntax Support delete probe syntax. The syntax is "-:[group/]event". Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. 
Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE LKML-Reference: <20091208220316.10142.39192.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aff5f80b59b8..bf05fb49a6f3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -606,23 +606,22 @@ static int create_trace_probe(int argc, char **argv) */ struct trace_probe *tp; int i, ret = 0; - int is_return = 0; + int is_return = 0, is_delete = 0; char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; - if (argc < 2) { - pr_info("Probe point is not specified.\n"); - return -EINVAL; - } - + /* argc must be >= 1 */ if (argv[0][0] == 'p') is_return = 0; else if (argv[0][0] == 'r') is_return = 1; + else if (argv[0][0] == '-') + is_delete = 1; else { - pr_info("Probe definition must be started with 'p' or 'r'.\n"); + pr_info("Probe definition must be started with 'p', 'r' or" + " '-'.\n"); return -EINVAL; } @@ -642,7 +641,29 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } } + if (!group) + group = KPROBE_EVENT_SYSTEM; + if (is_delete) { + if (!event) { + pr_info("Delete command needs an event name.\n"); + return -EINVAL; + } + tp = find_probe_event(event, group); + if (!tp) { + pr_info("Event %s/%s doesn't exist.\n", group, event); + return -ENOENT; + } + /* delete an event */ + unregister_trace_probe(tp); + free_trace_probe(tp); + 
return 0; + } + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } if (isdigit(argv[1][0])) { if (is_return) { pr_info("Return probe point must be a symbol.\n"); @@ -671,8 +692,6 @@ static int create_trace_probe(int argc, char **argv) argc -= 2; argv += 2; /* setup a probe */ - if (!group) - group = KPROBE_EVENT_SYSTEM; if (!event) { /* Make a new event name */ if (symbol) -- cgit v1.2.1 From 44234adcdce38f83c56e05f808ce656175b4beeb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 9 Dec 2009 09:25:48 +0100 Subject: hw-breakpoints: Modify breakpoints without unregistering them Currently, when ptrace needs to modify a breakpoint, like disabling it, changing its address, type or len, it calls modify_user_hw_breakpoint(). This latter will perform the heavy and racy task of unregistering the old breakpoint and registering a new one. This is racy as someone else might steal the reserved breakpoint slot under us, which is undesired as the breakpoint is only supposed to be modified, sometimes in the middle of a debugging workflow. We don't want our slot to be stolen in the middle. So instead of unregistering/registering the breakpoint, just disable it while we modify its breakpoint fields and re-enable it after if necessary. 
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Prasad LKML-Reference: <1260347148-5519-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 42 ++++++++++++++++++++++++++++++++---------- kernel/perf_event.c | 4 ++-- 2 files changed, 34 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 03a0773ac2b2..366eedf949c0 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -320,18 +320,40 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); * @triggered: callback to trigger when we hit the breakpoint * @tsk: pointer to 'task_struct' of the process to which the address belongs */ -struct perf_event * -modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) +int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { - /* - * FIXME: do it without unregistering - * - We don't want to lose our slot - * - If the new bp is incorrect, don't lose the older one - */ - unregister_hw_breakpoint(bp); + u64 old_addr = bp->attr.bp_addr; + int old_type = bp->attr.bp_type; + int old_len = bp->attr.bp_len; + int err = 0; + + perf_event_disable(bp); + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + if (attr->disabled) + goto end; - return perf_event_create_kernel_counter(attr, -1, bp->ctx->task->pid, - bp->overflow_handler); + err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + if (!err) + perf_event_enable(bp); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + if (!bp->attr.disabled) + perf_event_enable(bp); + + return err; + } + +end: + bp->attr.disabled = attr->disabled; + + return 0; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index fd43ff4ac860..3b0cf86eee84 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ 
-567,7 +567,7 @@ static void __perf_event_disable(void *info) * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ -static void perf_event_disable(struct perf_event *event) +void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -971,7 +971,7 @@ static void __perf_event_enable(void *info) * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ -static void perf_event_enable(struct perf_event *event) +void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; -- cgit v1.2.1 From aa5452d70c0d559310598b243b8b1033c10056e7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Dec 2009 11:28:13 +0800 Subject: perf_event: Clean up __perf_event_init_context() Clean up the code a bit: - define 'perf_cpu_context' variable with 'static' - use kzalloc() instead of kmalloc() and memset() Signed-off-by: Xiao Guangrong Reviewed-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B1F194D.7080306@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3b0cf86eee84..2b06c45bfba9 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -36,7 +36,7 @@ /* * Each CPU has a list of per CPU events: */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); +static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); int perf_max_events __read_mostly = 1; static int perf_reserved_percpu __read_mostly; @@ -1579,7 +1579,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx, struct task_struct *task) { - memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); 
INIT_LIST_HEAD(&ctx->group_list); @@ -1654,7 +1653,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) } if (!ctx) { - ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); err = -ENOMEM; if (!ctx) goto errout; @@ -5105,7 +5104,7 @@ int perf_event_init_task(struct task_struct *child) * First allocate and initialize a context for the child. */ - child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + child_ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); if (!child_ctx) return -ENOMEM; -- cgit v1.2.1 From b93f7978ad6b46133e9453b90ccc057dc2429e75 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Dec 2009 11:29:44 +0800 Subject: perf_event: Allocate children's perf_event_ctxp at the right time In the current code, a child task allocates memory for 'child->perf_event_ctxp' whenever the parent is counted; we can instead do so only if the parent allows its children to inherit events. This saves memory and reduces overhead.
Signed-off-by: Xiao Guangrong Reviewed-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B1F19A8.5040805@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2b06c45bfba9..77641ae6b23f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5083,7 +5083,7 @@ again: */ int perf_event_init_task(struct task_struct *child) { - struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *child_ctx = NULL, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; @@ -5098,20 +5098,6 @@ int perf_event_init_task(struct task_struct *child) if (likely(!parent->perf_event_ctxp)) return 0; - /* - * This is executed from the parent task context, so inherit - * events that have been marked for cloning. - * First allocate and initialize a context for the child. - */ - - child_ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); - /* * If the parent's context is a clone, pin it so it won't get * swapped under us. @@ -5142,6 +5128,26 @@ int perf_event_init_task(struct task_struct *child) continue; } + if (!child->perf_event_ctxp) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. 
+ */ + + child_ctx = kzalloc(sizeof(struct perf_event_context), + GFP_KERNEL); + if (!child_ctx) { + ret = -ENOMEM; + goto exit; + } + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + } + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) { @@ -5170,6 +5176,7 @@ int perf_event_init_task(struct task_struct *child) get_ctx(child_ctx->parent_ctx); } +exit: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); -- cgit v1.2.1 From ec89a06fd4e12301f11ab039ee07d2353a18addc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Dec 2009 11:30:36 +0800 Subject: perf_event: Cleanup for cpu_clock_perf_event_update() Using atomic64_xchg() instead of atomic64_read() and atomic64_set(). Signed-off-by: Xiao Guangrong Reviewed-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B1F19DC.90204@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 77641ae6b23f..94e1b28333ae 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4079,8 +4079,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_read(&event->hw.prev_count); - atomic64_set(&event->hw.prev_count, now); + prev = atomic64_xchg(&event->hw.prev_count, now); atomic64_add(now - prev, &event->count); } -- cgit v1.2.1 From 3160568371da441b7f2fb57f2f1225404207e8f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Dec 2009 20:24:16 +0000 Subject: sched: Protect task->cpus_allowed access in sched_getaffinity() sched_getaffinity() is not protected against a concurrent modification of the tasks affinity. Serialize the access with task_rq_lock(task). 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra LKML-Reference: <20091208202026.769251187@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 281da29d0801..c4635f74540c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6631,6 +6631,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; + unsigned long flags; + struct rq *rq; int retval; get_online_cpus(); @@ -6645,7 +6647,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; + rq = task_rq_lock(p, &flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + task_rq_unlock(rq, &flags); out_unlock: read_unlock(&tasklist_lock); -- cgit v1.2.1 From dba091b9e3522b9d32fc9975e48d3b69633b45f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 09:32:03 +0100 Subject: sched: Protect sched_rr_get_param() access to task->sched_class sched_rr_get_param calls task->sched_class->get_rr_interval(task) without protection against a concurrent sched_setscheduler() call which modifies task->sched_class. Serialize the access with task_rq_lock(task) and hand the rq pointer into get_rr_interval() as it's needed at least in the sched_fair implementation. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++++- kernel/sched_fair.c | 6 +----- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c4635f74540c..68db5a2e6545 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6887,6 +6887,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; + unsigned long flags; + struct rq *rq; int retval; struct timespec t; @@ -6903,7 +6905,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - time_slice = p->sched_class->get_rr_interval(p); + rq = task_rq_lock(p, &flags); + time_slice = p->sched_class->get_rr_interval(rq, p); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f61837ad336d..613c1c749677 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2014,21 +2014,17 @@ static void moved_group_fair(struct task_struct *p) } #endif -unsigned int get_rr_interval_fair(struct task_struct *task) +unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { struct sched_entity *se = &task->se; - unsigned long flags; - struct rq *rq; unsigned int rr_interval = 0; /* * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise * idle runqueue: */ - rq = task_rq_lock(task, &flags); if (rq->cfs.load.weight) rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); return rr_interval; } diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index b133a28fcde3..33d5384a73a8 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, check_preempt_curr(rq, p, 0); } -unsigned int 
get_rr_interval_idle(struct task_struct *task) +unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) { return 0; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 5c5fef378415..aecbd9c6b20c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1721,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq) dequeue_pushable_task(rq, p); } -unsigned int get_rr_interval_rt(struct task_struct *task) +unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* * Time slice is 0 for SCHED_FIFO tasks -- cgit v1.2.1 From 6b314d0e11924c803bf8cd944e87fd58cdb5088c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Dec 2009 18:58:05 +0100 Subject: sched: Remove sysctl.sched_features Since we've had a much saner debugfs interface to this, remove the sysctl one. Signed-off-by: Peter Zijlstra LKML-Reference: [ v2: build fix ] Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dbf93a52ee9..e5cc53514caa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -314,14 +314,6 @@ static struct ctl_table kern_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_migration_cost", -- cgit v1.2.1 From 970b13bacba14a8cef6f642861947df1d175b0b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Nov 2009 13:31:39 +0100 Subject: sched: Consolidate select_task_rq() callers Small cleanup. 
Signed-off-by: Peter Zijlstra LKML-Reference: [ v2: build fix ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 68db5a2e6545..01fd131b47a4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2323,6 +2323,14 @@ void task_oncpu_function_call(struct task_struct *p, preempt_enable(); } +#ifdef CONFIG_SMP +static inline +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +{ + return p->sched_class->select_task_rq(p, sd_flags, wake_flags); +} +#endif + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2376,7 +2384,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, p->state = TASK_WAKING; task_rq_unlock(rq, &flags); - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) { local_irq_save(flags); rq = cpu_rq(cpu); @@ -2593,7 +2601,7 @@ void sched_fork(struct task_struct *p, int clone_flags) p->sched_class = &fair_sched_class; #ifdef CONFIG_SMP - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); #endif local_irq_save(flags); update_rq_clock(cpu_rq(cpu)); @@ -3156,7 +3164,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); + new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); -- cgit v1.2.1 From 5afcdab706d6002cb02b567ba46e650215e694e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Nov 2009 14:12:25 +0100 Subject: sched: Remove rq->clock coupling from set_task_cpu() set_task_cpu() should be rq invariant and only touch task state, it currently fails to do so, which opens up a few races, since not all callers hold both rq->locks. 
Remove the reliance on rq->clock, as any site calling set_task_cpu() should also do a remote clock update, which should ensure the observed time between these two cpus is monotonic, as per kernel/sched_clock.c:sched_clock_remote(). Therefore we can simply remove the clock_offset bits and be happy. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 01fd131b47a4..1f9c6d99f15d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2060,23 +2060,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); + struct rq *old_rq = cpu_rq(old_cpu); struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); - u64 clock_offset; - - clock_offset = old_rq->clock - new_rq->clock; trace_sched_migrate_task(p, new_cpu); -#ifdef CONFIG_SCHEDSTATS - if (p->se.wait_start) - p->se.wait_start -= clock_offset; - if (p->se.sleep_start) - p->se.sleep_start -= clock_offset; - if (p->se.block_start) - p->se.block_start -= clock_offset; -#endif if (old_cpu != new_cpu) { p->se.nr_migrations++; #ifdef CONFIG_SCHEDSTATS -- cgit v1.2.1 From ab19cb23313733c55e0517607844b86720b35f5f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Nov 2009 15:44:43 +0100 Subject: sched: Clean up ttwu() rq locking Since set_task_cpu() doesn't rely on rq->clock anymore we can simplify the mess in ttwu(). Optimize things a bit by not fiddling with the IRQ state there.
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1f9c6d99f15d..c92670f8e097 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2371,17 +2371,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) { - local_irq_save(flags); - rq = cpu_rq(cpu); - update_rq_clock(rq); + if (cpu != orig_cpu) set_task_cpu(p, cpu); - local_irq_restore(flags); - } - rq = task_rq_lock(p, &flags); + + rq = __task_rq_lock(p); + update_rq_clock(rq); WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); -- cgit v1.2.1 From cd29fe6f2637cc2ccbda5ac65f5332d6bf5fa3c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Nov 2009 17:32:46 +0100 Subject: sched: Sanitize fork() handling Currently we try to do task placement in wake_up_new_task() after we do the load-balance pass in sched_fork(). This yields complicated semantics in that we have to deal with tasks on different RQs and the set_task_cpu() calls in copy_process() and sched_fork() Rename ->task_new() to ->task_fork() and call it from sched_fork() before the balancing, this gives the policy a clear point to place the task. 
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 47 ++++++++++++++++++----------------------------- kernel/sched_fair.c | 28 +++++++++++++++------------- 2 files changed, 33 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c92670f8e097..33c903573132 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1811,6 +1811,20 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_active(struct rq *this_rq); +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -1967,20 +1981,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. 
- */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio, int running) @@ -2552,7 +2552,6 @@ static void __sched_fork(struct task_struct *p) void sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); - unsigned long flags; __sched_fork(p); @@ -2586,13 +2585,13 @@ void sched_fork(struct task_struct *p, int clone_flags) if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + #ifdef CONFIG_SMP cpu = select_task_rq(p, SD_BALANCE_FORK, 0); #endif - local_irq_save(flags); - update_rq_clock(cpu_rq(cpu)); set_task_cpu(p, cpu); - local_irq_restore(flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2625,17 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - - if (!p->sched_class->task_new || !current->se.on_rq) { - activate_task(rq, p, 0); - } else { - /* - * Let the scheduling class do new task startup - * management (if any): - */ - p->sched_class->task_new(rq, p); - inc_nr_running(rq); - } + activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 613c1c749677..44ec80ccfa85 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1922,28 +1922,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) } /* - * Share the fairness runtime between parent and child, thus the - * total amount of pressure for CPU stays equal - new tasks - * get a chance to run but frequent forkers are not allowed to - * monopolize the CPU. Note: the parent runqueue is locked, - * the child is not running yet. 
+ * called on fork with the child task as argument from the parent's context + * - child not yet on the tasklist + * - preemption disabled */ -static void task_new_fair(struct rq *rq, struct task_struct *p) +static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct cfs_rq *cfs_rq = task_cfs_rq(current); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; int this_cpu = smp_processor_id(); + struct rq *rq = this_rq(); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); - sched_info_queued(p); + if (unlikely(task_cpu(p) != this_cpu)) + __set_task_cpu(p, this_cpu); update_curr(cfs_rq); + if (curr) se->vruntime = curr->vruntime; place_entity(cfs_rq, se, 1); - /* 'curr' will be NULL if the child belongs to a different group */ - if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr && entity_before(curr, se)) { + if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. @@ -1952,7 +1954,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } - enqueue_task_fair(rq, p, 0); + spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -2052,7 +2054,7 @@ static const struct sched_class fair_sched_class = { .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, - .task_new = task_new_fair, + .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, -- cgit v1.2.1 From a65ac745e47e91f9d98dbf07f22ed0492e34d998 Mon Sep 17 00:00:00 2001 From: Jupyung Lee Date: Tue, 17 Nov 2009 18:51:40 +0900 Subject: sched: Move update_curr() in check_preempt_wakeup() to avoid redundant call If a RT task is woken up while a non-RT task is running, check_preempt_wakeup() is called to check whether the new task can preempt the old task. 
The function returns quickly without going deeper because it is apparent that a RT task can always preempt a non-RT task. In this situation, check_preempt_wakeup() always calls update_curr() to update vruntime value of the currently running task. However, the function call is unnecessary and redundant at that moment because (1) a non-RT task can always be preempted by a RT task regardless of its vruntime value, and (2) update_curr() will be called shortly when the context switch between two occurs. By moving update_curr() in check_preempt_wakeup(), we can avoid redundant call to update_curr(), slightly reducing the time taken to wake up RT tasks. Signed-off-by: Jupyung Lee [ Place update_curr() right before the wake_preempt_entity() call, which is the only thing that relies on the updated vruntime ] Signed-off-by: Peter Zijlstra LKML-Reference: <1258451500-6714-1-git-send-email-jupyung@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 44ec80ccfa85..4dec18579c9a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1651,8 +1651,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; - update_curr(cfs_rq); - if (unlikely(rt_prio(p->prio))) { resched_task(curr); return; @@ -1710,6 +1708,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ BUG_ON(!pse); + update_curr(cfs_rq); + if (wakeup_preempt_entity(se, pse) == 1) { resched_task(curr); /* -- cgit v1.2.1 From 3a7e73a2e26fffdbc46ba95fc0425418984f5140 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 28 Nov 2009 18:51:02 +0100 Subject: sched: Clean up check_preempt_wakeup() Streamline the wakeup preemption code a bit, unifying the preempt path so that they all do the same. 
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 73 ++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4dec18579c9a..76b5792c4198 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1651,10 +1651,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; - if (unlikely(rt_prio(p->prio))) { - resched_task(curr); - return; - } + if (unlikely(rt_prio(p->prio))) + goto preempt; if (unlikely(p->sched_class != &fair_sched_class)) return; @@ -1680,52 +1678,47 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* Idle tasks are by definition preempted by everybody. */ - if (unlikely(curr->policy == SCHED_IDLE)) { - resched_task(curr); - return; - } + if (unlikely(curr->policy == SCHED_IDLE)) + goto preempt; - if ((sched_feat(WAKEUP_SYNC) && sync) || - (sched_feat(WAKEUP_OVERLAP) && - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { - resched_task(curr); - return; - } + if (sched_feat(WAKEUP_SYNC) && sync) + goto preempt; - if (sched_feat(WAKEUP_RUNNING)) { - if (pse->avg_running < se->avg_running) { - set_next_buddy(pse); - resched_task(curr); - return; - } - } + if (sched_feat(WAKEUP_OVERLAP) && + se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost) + goto preempt; + + if (sched_feat(WAKEUP_RUNNING) && pse->avg_running < se->avg_running) + goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; + update_curr(cfs_rq); find_matching_se(&se, &pse); - BUG_ON(!pse); + if (wakeup_preempt_entity(se, pse) == 1) + goto preempt; - update_curr(cfs_rq); + return; - if (wakeup_preempt_entity(se, pse) == 1) { - resched_task(curr); - /* - * Only 
set the backward buddy when the current task is still - * on the rq. This can happen when a wakeup gets interleaved - * with schedule on the ->pre_schedule() or idle_balance() - * point, either of which can * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, - * for obvious reasons its a bad idea to schedule back to it. - */ - if (unlikely(!se->on_rq || curr == rq->idle)) - return; - if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) - set_last_buddy(se); - } +preempt: + resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.2.1 From 6cecd084d0fd27bb1e498e2829fd45846d806856 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Nov 2009 13:00:37 +0100 Subject: sched: Discard some old bits WAKEUP_RUNNING was an experiment, not sure why that ever ended up being merged... 
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++---------- kernel/sched_debug.c | 1 - kernel/sched_fair.c | 3 --- kernel/sched_features.h | 5 ----- 4 files changed, 7 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 33c903573132..0170735bdafc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2493,7 +2493,6 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; - p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -5379,13 +5378,14 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *p) +static void put_prev_task(struct rq *rq, struct task_struct *prev) { - u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; + if (prev->state == TASK_RUNNING) { + u64 runtime = prev->se.sum_exec_runtime; - update_avg(&p->se.avg_running, runtime); + runtime -= prev->se.prev_sum_exec_runtime; + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5395,12 +5395,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p) * correlates to the amount of cache footprint a task can * build up. 
*/ - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - update_avg(&p->se.avg_overlap, runtime); - } else { - update_avg(&p->se.avg_running, 0); + update_avg(&prev->se.avg_overlap, runtime); } - p->sched_class->put_prev_task(rq, p); + prev->sched_class->put_prev_task(rq, prev); } /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 6988cf08f705..5fda66615fee 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -399,7 +399,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.sum_exec_runtime); PN(se.avg_overlap); PN(se.avg_wakeup); - PN(se.avg_running); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 76b5792c4198..e9f5daee12c7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1689,9 +1689,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ pse->avg_overlap < sysctl_sched_migration_cost) goto preempt; - if (sched_feat(WAKEUP_RUNNING) && pse->avg_running < se->avg_running) - goto preempt; - if (!sched_feat(WAKEUP_PREEMPT)) return; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 0d94083582c7..d5059fd761d9 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -53,11 +53,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0) */ SCHED_FEAT(WAKEUP_OVERLAP, 0) -/* - * Wakeup preemption towards tasks that run short - */ -SCHED_FEAT(WAKEUP_RUNNING, 0) - /* * Use the SYNC wakeup hint, pipes and the likes use this to indicate * the remote end is likely to consume the data we just wrote, and -- cgit v1.2.1 From fb58bac5c75bfff8bbf7d02071a10a62f32fe28b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Dec 2009 12:21:47 +0100 Subject: sched: Remove unnecessary RCU exclusion As Nick pointed out, and realized by myself when doing: sched: Fix balance vs hotplug race the patch: sched: for_each_domain() vs RCU is wrong, sched_domains are freed after synchronize_sched(), which means disabling preemption is 
enough. Reported-by: Nick Piggin Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e9f5daee12c7..c163a285bf05 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1403,7 +1403,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag new_cpu = prev_cpu; } - rcu_read_lock(); for_each_domain(cpu, tmp) { /* * If power savings logic is enabled for a domain, see if we @@ -1484,10 +1483,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag update_shares(tmp); } - if (affine_sd && wake_affine(affine_sd, p, sync)) { - new_cpu = cpu; - goto out; - } + if (affine_sd && wake_affine(affine_sd, p, sync)) + return cpu; while (sd) { int load_idx = sd->forkexec_idx; @@ -1528,8 +1525,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* while loop will break here if sd == NULL */ } -out: - rcu_read_unlock(); return new_cpu; } #endif /* CONFIG_SMP */ -- cgit v1.2.1 From cd8ad40de36c2fe75f3b731bd70198b385895246 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Dec 2009 18:00:07 +0100 Subject: sched: cgroup: Implement different treatment for idle shares When setting the weight for a per-cpu task-group, we have to put in a phantom weight when there is no work on that cpu, otherwise we'll not service that cpu when new work gets placed there until we again update the per-cpu weights. We used to add these phantom weights to the total, so that the idle per-cpu shares don't get inflated, this however causes the non-idle parts to get deflated, causing unexpected weight distributions. Reverse this, so that the non-idle shares are correct but the idle shares are inflated.
Reported-by: Yasunori Goto Tested-by: Yasunori Goto Signed-off-by: Peter Zijlstra LKML-Reference: <1257934048.23203.76.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0170735bdafc..71eb0622f548 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1614,7 +1614,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, shares = 0; + unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; @@ -1630,6 +1630,7 @@ static int tg_shares_up(struct task_group *tg, void *data) weight = tg->cfs_rq[i]->load.weight; usd_rq_weight[i] = weight; + rq_weight += weight; /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1638,10 +1639,13 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!weight) weight = NICE_0_LOAD; - rq_weight += weight; + sum_weight += weight; shares += tg->cfs_rq[i]->shares; } + if (!rq_weight) + rq_weight = sum_weight; + if ((!shares && rq_weight) || shares > tg->shares) shares = tg->shares; -- cgit v1.2.1 From 57785df5ac53c70da9fb53696130f3c551bfe1f9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Dec 2009 09:59:02 +0100 Subject: sched: Fix task priority bug 83f9ac removed a call to effective_prio() in wake_up_new_task(), which leads to tasks running at MAX_PRIO. This is caused by the idle thread being set to MAX_PRIO before forking off init. O(1) used that to make sure idle was always preempted, CFS uses check_preempt_curr_idle() for that so we can safely remove this bit of legacy code.
Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1259754383.4003.610.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 71eb0622f548..3878f5018007 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3158,10 +3158,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ check_preempt_curr(this_rq, p, 0); } @@ -6992,7 +6988,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) __sched_fork(idle); idle->se.exec_start = sched_clock(); - idle->prio = idle->normal_prio = MAX_PRIO; cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); @@ -7696,7 +7691,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irq(&rq->lock); update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); - rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); -- cgit v1.2.1 From 0bcdcf28c979869f44e05121b96ff2cfb05bd8e6 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 30 Nov 2009 12:16:46 +0100 Subject: sched: Fix missing sched tunable recalculation on cpu add/remove Based on Peter Zijlstra's patch suggestion this enables recalculation of the scheduler tunables in response to a change in the number of cpus. It also adds a max of eight cpus that are considered in that scaling.
Signed-off-by: Christian Ehrhardt Signed-off-by: Peter Zijlstra LKML-Reference: <1259579808-11357-2-git-send-email-ehrhardt@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++++++------------- kernel/sched_fair.c | 16 ++++++++++++++++ 2 files changed, 32 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3878f5018007..b54ecf84b6be 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; * default: 0.25ms */ unsigned int sysctl_sched_shares_ratelimit = 250000; +unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; /* * Inject some fuzzyness into changing the per-cpu group shares @@ -1814,6 +1815,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif static void calc_load_account_active(struct rq *this_rq); +static void update_sysctl(void); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -7028,22 +7030,23 @@ cpumask_var_t nohz_cpu_mask; * * This idea comes from the SD scheduler of Con Kolivas: */ -static inline void sched_init_granularity(void) +static void update_sysctl(void) { - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 200000000; - - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; - - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; + unsigned int cpus = min(num_online_cpus(), 8U); + unsigned int factor = 1 + ilog2(cpus); - sysctl_sched_wakeup_granularity *= factor; +#define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) + SET_SYSCTL(sched_min_granularity); + SET_SYSCTL(sched_latency); + SET_SYSCTL(sched_wakeup_granularity); + SET_SYSCTL(sched_shares_ratelimit); +#undef SET_SYSCTL +} - sysctl_sched_shares_ratelimit *= factor; +static inline void 
sched_init_granularity(void) +{ + update_sysctl(); } #ifdef CONFIG_SMP diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c163a285bf05..71b3458245e5 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -35,12 +35,14 @@ * run vmstat and monitor the context-switches (cs) field) */ unsigned int sysctl_sched_latency = 5000000ULL; +unsigned int normalized_sysctl_sched_latency = 5000000ULL; /* * Minimal preemption granularity for CPU-bound tasks: * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 1000000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity @@ -70,6 +72,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield; * have immediate wakeup/sleep latencies. */ unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -1890,6 +1893,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } + +static void rq_online_fair(struct rq *rq) +{ + update_sysctl(); +} + +static void rq_offline_fair(struct rq *rq) +{ + update_sysctl(); +} + #endif /* CONFIG_SMP */ /* @@ -2035,6 +2049,8 @@ static const struct sched_class fair_sched_class = { .load_balance = load_balance_fair, .move_one_task = move_one_task_fair, + .rq_online = rq_online_fair, + .rq_offline = rq_offline_fair, #endif .set_curr_task = set_curr_task_fair, -- cgit v1.2.1 From 1983a922a1bc843806b9a36cf3a370b242783140 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 30 Nov 2009 12:16:47 +0100 Subject: sched: Make tunable scaling style configurable As scaling now takes place on all kind of cpu add/remove events a user that configures values via proc should be able to configure if his set values are still rescaled or kept whatever happens. 
As the comments state that log2 was just a second guess that worked the interface is not just designed for on/off, but to choose a scaling type. Currently this allows none, log and linear, but more importantly it allows us to keep the interface even if someone has an even better idea how to scale the values. Signed-off-by: Christian Ehrhardt Signed-off-by: Peter Zijlstra LKML-Reference: <1259579808-11357-3-git-send-email-ehrhardt@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++++++- kernel/sched_debug.c | 10 ++++++++++ kernel/sched_fair.c | 13 +++++++++++++ kernel/sysctl.c | 14 ++++++++++++++ 4 files changed, 51 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b54ecf84b6be..116efed962c6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7033,7 +7033,20 @@ cpumask_var_t nohz_cpu_mask; static void update_sysctl(void) { unsigned int cpus = min(num_online_cpus(), 8U); - unsigned int factor = 1 + ilog2(cpus); + unsigned int factor; + + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + factor = 1; + break; + case SCHED_TUNABLESCALING_LINEAR: + factor = cpus; + break; + case SCHED_TUNABLESCALING_LOG: + default: + factor = 1 + ilog2(cpus); + break; + } #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5fda66615fee..0fc5287fe80f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu) print_rq(m, rq, cpu); } +static const char *sched_tunable_scaling_names[] = { + "none", + "logaritmic", + "linear" +}; + static int sched_debug_show(struct seq_file *m, void *v) { u64 now = ktime_to_ns(ktime_get()); @@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P + SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + sysctl_sched_tunable_scaling, +
sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + for_each_online_cpu(cpu) print_cpu(m, cpu); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 71b3458245e5..455106d318a8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -21,6 +21,7 @@ */ #include <linux/latencytop.h> +#include <linux/sched.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -37,6 +38,18 @@ unsigned int sysctl_sched_latency = 5000000ULL; unsigned int normalized_sysctl_sched_latency = 5000000ULL; +/* + * The initial- and re-scaling of tunables is configurable + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * + * Options are: + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + */ +enum sched_tunable_scaling sysctl_sched_tunable_scaling + = SCHED_TUNABLESCALING_LOG; + /* * Minimal preemption granularity for CPU-bound tasks: * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e5cc53514caa..d10406e5fdfe 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -251,6 +251,8 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; #endif static struct ctl_table kern_table[] = { @@ -304,6 +306,18 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_tunable_scaling", + .data = &sysctl_sched_tunable_scaling, + .maxlen = sizeof(enum sched_tunable_scaling), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_tunable_scaling, + .extra2 =
&max_sched_tunable_scaling, + }, + { .ctl_name = CTL_UNNUMBERED, .procname = "sched_shares_thresh", -- cgit v1.2.1 From acb4a848da821a095ae9e4d8b22ae2d9633ba5cd Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 30 Nov 2009 12:16:48 +0100 Subject: sched: Update normalized values on user updates via proc The normalized values are also recalculated in case the scaling factor changes. This patch updates the internally used scheduler tuning values that are normalized to one cpu in case a user sets new values via sysfs. Together with patch 2 of this series this allows to let user configured values scale (or not) to cpu add/remove events taking place later. Signed-off-by: Christian Ehrhardt Signed-off-by: Peter Zijlstra LKML-Reference: <1259579808-11357-4-git-send-email-ehrhardt@linux.vnet.ibm.com> [ v2: fix warning ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++++-- kernel/sched_fair.c | 11 ++++++++++- kernel/sysctl.c | 14 +++++++++----- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 116efed962c6..0a60e8e9b094 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1816,6 +1816,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_active(struct rq *this_rq); static void update_sysctl(void); +static int get_update_sysctl_factor(void); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -7030,9 +7031,9 @@ cpumask_var_t nohz_cpu_mask; * * This idea comes from the SD scheduler of Con Kolivas: */ -static void update_sysctl(void) +static int get_update_sysctl_factor(void) { - unsigned int cpus = min(num_online_cpus(), 8U); + unsigned int cpus = min(num_online_cpus(), 8); unsigned int factor; switch (sysctl_sched_tunable_scaling) { @@ -7048,6 +7049,13 @@ static void update_sysctl(void) break; } + return factor; +} + +static void update_sysctl(void) +{ + unsigned int factor = 
get_update_sysctl_factor(); + #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) SET_SYSCTL(sched_min_granularity); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 455106d318a8..804a411838f1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -399,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ #ifdef CONFIG_SCHED_DEBUG -int sched_nr_latency_handler(struct ctl_table *table, int write, +int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int factor = get_update_sysctl_factor(); if (ret || !write) return ret; @@ -411,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity); +#define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) + WRT_SYSCTL(sched_min_granularity); + WRT_SYSCTL(sched_latency); + WRT_SYSCTL(sched_wakeup_granularity); + WRT_SYSCTL(sched_shares_ratelimit); +#undef WRT_SYSCTL + return 0; } #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d10406e5fdfe..b9e5a45f1e28 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -253,6 +253,8 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; +static int min_sched_shares_ratelimit = 100000; /* 100 usec */ +static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif static struct ctl_table kern_table[] = { @@ -271,7 +273,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_nr_latency_handler, + .proc_handler = &sched_proc_update_handler, 
.strategy = &sysctl_intvec, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, @@ -282,7 +284,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_nr_latency_handler, + .proc_handler = &sched_proc_update_handler, .strategy = &sysctl_intvec, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, @@ -293,7 +295,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_wakeup_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &sched_proc_update_handler, .strategy = &sysctl_intvec, .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, @@ -304,7 +306,9 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_shares_ratelimit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_proc_update_handler, + .extra1 = &min_sched_shares_ratelimit, + .extra2 = &max_sched_shares_ratelimit, }, { .ctl_name = CTL_UNNUMBERED, @@ -312,7 +316,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_tunable_scaling, .maxlen = sizeof(enum sched_tunable_scaling), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &sched_proc_update_handler, .strategy = &sysctl_intvec, .extra1 = &min_sched_tunable_scaling, .extra2 = &max_sched_tunable_scaling, -- cgit v1.2.1 From 822a6961112f0c9101d3359d8524604c3309ee6c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 8 Dec 2009 10:00:04 +0100 Subject: tracing/kprobes: Fix field creation's bad error handling When we define the common event fields in kprobe, we invert the error handling and return immediately in case of success. Then we omit to define specific kprobes fields (ip and nargs), and specific kretprobes fields (func, ret_ip, nargs). And we only define them when we fail to create common fields. 
The most visible consequence is that we can't create filters for k(ret)probes specific fields. This patch re-inverts the success/error handling to fix it. Reported-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Li Zefan LKML-Reference: <1260263815-5167-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bf05fb49a6f3..b52d397e57eb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1133,7 +1133,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) struct trace_probe *tp = (struct trace_probe *)event_call->data; ret = trace_define_common_fields(event_call); - if (!ret) + if (ret) return ret; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); @@ -1151,7 +1151,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) struct trace_probe *tp = (struct trace_probe *)event_call->data; ret = trace_define_common_fields(event_call); - if (!ret) + if (ret) return ret; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); -- cgit v1.2.1 From 29bf4a5e3fed3dde3eb629a0cb1762c1e9217458 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Dec 2009 12:37:43 -0500 Subject: tracing: Only call pipe_close if pipe_close is defined This fixes a cut and paste error that had pipe_close get called if pipe_open was defined (not pipe_close).
Reported-by: Kosaki Motohiro LKML-Reference: <20091209153204.F4CD.A69D9226@jp.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f804b407d438..dc937e1baa91 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2899,7 +2899,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); - if (iter->trace->pipe_open) + if (iter->trace->pipe_close) iter->trace->pipe_close(iter); mutex_unlock(&trace_types_lock); -- cgit v1.2.1 From a63ce5b306855bccdacba95c03bfc293316c8ae3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Dec 2009 09:11:39 -0500 Subject: tracing: Buffer the output of seq_file in case of filled buffer If the seq_read fills the buffer it will call s_start again on the next iteration with the same position. This causes a problem with the function_graph tracer because it consumes the iteration in order to determine leaf functions. What happens is that the iterator stores the entry, and the function graph plugin will look at the next entry. If that next entry is a return of the same function and task, then the function is a leaf and the function_graph plugin calls ring_buffer_read which moves the ring buffer iterator forward (the trace iterator still points to the function start entry). The copying of the trace_seq to the seq_file buffer will fail if the seq_file buffer is full. The seq_read will not show this entry. The next read by userspace will cause seq_read to again call s_start which will reuse the trace iterator entry (the function start entry). But the function return entry was already consumed. The function graph plugin will think that this entry is a nested function and not a leaf. To solve this, the trace code now checks the return status of the seq_printf (trace_print_seq).
If the writing to the seq_file buffer fails, we set a flag in the iterator (leftover) and we do not reset the trace_seq buffer. On the next call to s_start, we check the leftover flag, and if it is set, we just reuse the trace_seq buffer and do not call into the plugin print functions. Before this patch: 2) | fput() { 2) | __fput() { 2) 0.550 us | inotify_inode_queue_event(); 2) | __fsnotify_parent() { 2) 0.540 us | inotify_dentry_parent_queue_event(); After the patch: 2) | fput() { 2) | __fput() { 2) 0.550 us | inotify_inode_queue_event(); 2) 0.548 us | __fsnotify_parent(); 2) 0.540 us | inotify_dentry_parent_queue_event(); [ Updated the patch to fix a missing return 0 from the trace_print_seq() stub when CONFIG_TRACING is disabled. Reported-by: Ingo Molnar ] Reported-by: Jiri Olsa Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 35 ++++++++++++++++++++++++++++++++--- kernel/trace/trace_output.c | 14 +++++++++++--- 2 files changed, 43 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc937e1baa91..484114d70743 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1516,6 +1516,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) int i = (int)*pos; void *ent; + WARN_ON_ONCE(iter->leftover); + (*pos)++; /* can't go backwards */ @@ -1614,8 +1616,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) ; } else { - l = *pos - 1; - p = s_next(m, p, &l); + /* + * If we overflowed the seq_file before, then we want + * to just reuse the trace_seq buffer again. 
+ */ + if (iter->leftover) + p = iter; + else { + l = *pos - 1; + p = s_next(m, p, &l); + } } trace_event_read_lock(); @@ -1923,6 +1933,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; + int ret; if (iter->ent == NULL) { if (iter->tr) { @@ -1942,9 +1953,27 @@ static int s_show(struct seq_file *m, void *v) if (!(trace_flags & TRACE_ITER_VERBOSE)) print_func_help_header(m); } + } else if (iter->leftover) { + /* + * If we filled the seq_file buffer earlier, we + * want to just show it now. + */ + ret = trace_print_seq(m, &iter->seq); + + /* ret should this time be zero, but you never know */ + iter->leftover = ret; + } else { print_trace_line(iter); - trace_print_seq(m, &iter->seq); + ret = trace_print_seq(m, &iter->seq); + /* + * If we overflow the seq_file buffer, then it will + * ask us for this data again at start up. + * Use that instead. + * ret is 0 if seq_file write succeeded. + * -1 otherwise. + */ + iter->leftover = ret; } return 0; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b6c12c6a1bcd..e5cf90fef34e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -void trace_print_seq(struct seq_file *m, struct trace_seq *s) +int trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + int ret; + + ret = seq_write(m, s->buffer, len); - seq_write(m, s->buffer, len); + /* + * Only reset this buffer if we successfully wrote to the + * seq_file buffer. 
+ */ + if (!ret) + trace_seq_init(s); - trace_seq_init(s); + return ret; } enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) -- cgit v1.2.1 From d184b31c0e403580aafb3f8955ecc185a3d04801 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 25 Nov 2009 16:10:14 +0100 Subject: tracing: Add full state to trace_seq The trace_seq buffer might fill up, and right now one needs to check the return value of each printf into the buffer to check for that. Instead, have the buffer keep track of whether it is full or not, and reject more input if it is full or would have overflowed with an input that wasn't added. Cc: Lai Jiangshan Signed-off-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 61 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e5cf90fef34e..8e46b3323cdc 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -93,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) va_list ap; int ret; - if (!len) + if (s->full || !len) return 0; va_start(ap, fmt); @@ -101,8 +101,10 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
va_end(ap); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -127,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) int len = (PAGE_SIZE - 1) - s->len; int ret; - if (!len) + if (s->full || !len) return 0; ret = vsnprintf(s->buffer + s->len, len, fmt, args); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -147,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) int len = (PAGE_SIZE - 1) - s->len; int ret; - if (!len) + if (s->full || !len) return 0; ret = bstr_printf(s->buffer + s->len, len, fmt, binary); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -175,9 +181,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str) { int len = strlen(str); - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) return 0; + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; + return 0; + } + memcpy(s->buffer + s->len, str, len); s->len += len; @@ -186,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str) int trace_seq_putc(struct trace_seq *s, unsigned char c) { - if (s->len >= (PAGE_SIZE - 1)) + if (s->full) return 0; + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; + return 0; + } + s->buffer[s->len++] = c; return 1; @@ -196,9 +212,14 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c) int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) { - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) return 0; + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; + return 0; + } + memcpy(s->buffer + s->len, mem, len); s->len += len; @@ -211,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) const unsigned char *data = mem; int i, 
j; + if (s->full) + return 0; + #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < len; i++) { #else @@ -228,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) { void *ret; - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) + return 0; + + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; return NULL; + } ret = s->buffer + s->len; s->len += len; @@ -241,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path) { unsigned char *p; - if (s->len >= (PAGE_SIZE - 1)) + if (s->full) return 0; + + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; + return 0; + } + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); if (!IS_ERR(p)) { p = mangle_path(s->buffer + s->len, p, "\n"); @@ -255,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path) return 1; } + s->full = 1; return 0; } @@ -381,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, unsigned long vmstart = 0; int ret = 1; + if (s->full) + return 0; + if (mm) { const struct vm_area_struct *vma; -- cgit v1.2.1 From be1eca39319689aed7d3aedb9c3bece9469fe10f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 24 Nov 2009 13:57:38 +0100 Subject: tracing: Fix function graph trace_pipe to properly display failed entries There is a case where the graph tracer might get confused and omit displaying a single record. This applies mostly with the trace_pipe since it is unlikely that the trace_seq buffer will overflow with the trace file. As the function_graph tracer goes through the trace entries keeping a pointer to the current record: current -> func1 ENTRY func2 ENTRY func2 RETURN func1 RETURN When a function ENTRY is encountered, it moves the pointer to the next entry to check if the function is a nested or leaf function. func1 ENTRY current -> func2 ENTRY func2 RETURN func1 RETURN If the rest of the writing of the function fills the trace_seq buffer, then the trace_pipe read will ignore this entry.
The next read will now start at the current location, but the first entry (func1) will be discarded. This patch keeps a copy of the current entry in the iterator private storage and will keep track of when the trace_seq buffer fills. When the trace_seq buffer fills, it will reuse the copy of the entry in the next iteration. [ This patch has been largely modified by Steven Rostedt in order to clean it up and simplify it. The original idea and concept was from Jirka and for that, this patch will go under his name to give him the credit he deserves. But because this was modified by Steven Rostedt, anything wrong with the patch should be blamed on Steven. ] Signed-off-by: Jiri Olsa Cc: Frederic Weisbecker LKML-Reference: <1259067458-27143-1-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 165 +++++++++++++++++++++++++++-------- 1 file changed, 131 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 45e6c01b2e4d..a43d009c561a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,9 +14,20 @@ #include "trace.h" #include "trace_output.h" -struct fgraph_data { +struct fgraph_cpu_data { pid_t last_pid; int depth; + int ignore; +}; + +struct fgraph_data { + struct fgraph_cpu_data *cpu_data; + + /* Place to preserve last processed entry.
*/ + struct ftrace_graph_ent_entry ent; + struct ftrace_graph_ret_entry ret; + int failed; + int cpu; }; #define TRACE_GRAPH_INDENT 2 @@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (!data) return TRACE_TYPE_HANDLED; - last_pid = &(per_cpu_ptr(data, cpu)->last_pid); + last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); if (*last_pid == pid) return TRACE_TYPE_HANDLED; @@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry * get_return_for_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *curr) { - struct ring_buffer_iter *ring_iter; + struct fgraph_data *data = iter->private; + struct ring_buffer_iter *ring_iter = NULL; struct ring_buffer_event *event; struct ftrace_graph_ret_entry *next; - ring_iter = iter->buffer_iter[iter->cpu]; + /* + * If the previous output failed to write to the seq buffer, + * then we just reuse the data from before. + */ + if (data && data->failed) { + curr = &data->ent; + next = &data->ret; + } else { - /* First peek to compare current entry and the next one */ - if (ring_iter) - event = ring_buffer_iter_peek(ring_iter, NULL); - else { - /* We need to consume the current entry to see the next one */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); - event = ring_buffer_peek(iter->tr->buffer, iter->cpu, - NULL); - } + ring_iter = iter->buffer_iter[iter->cpu]; + + /* First peek to compare current entry and the next one */ + if (ring_iter) + event = ring_buffer_iter_peek(ring_iter, NULL); + else { + /* + * We need to consume the current entry to see + * the next one. + */ + ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + NULL); + } - if (!event) - return NULL; + if (!event) + return NULL; + + next = ring_buffer_event_data(event); - next = ring_buffer_event_data(event); + if (data) { + /* + * Save current and next entries for later reference + * if the output fails. 
+ */ + data->ent = *curr; + data->ret = *next; + } + } if (next->ent.type != TRACE_GRAPH_RET) return NULL; @@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); /* * Comments display at + 1 to depth. Since @@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); *depth = call->depth; } @@ -782,19 +816,34 @@ static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter) { - int cpu = iter->cpu; + struct fgraph_data *data = iter->private; struct ftrace_graph_ent *call = &field->graph_ent; struct ftrace_graph_ret_entry *leaf_ret; + static enum print_line_t ret; + int cpu = iter->cpu; if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) return TRACE_TYPE_PARTIAL_LINE; leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) - return print_graph_entry_leaf(iter, field, leaf_ret, s); + ret = print_graph_entry_leaf(iter, field, leaf_ret, s); else - return print_graph_entry_nested(iter, field, s, cpu); + ret = print_graph_entry_nested(iter, field, s, cpu); + if (data) { + /* + * If we failed to write our output, then we need to make + * note of it. Because we already consumed our entry. + */ + if (s->full) { + data->failed = 1; + data->cpu = cpu; + } else + data->failed = 0; + } + + return ret; } static enum print_line_t @@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); /* * Comments display at + 1 to depth. 
This is the @@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, int i; if (data) - depth = per_cpu_ptr(data, iter->cpu)->depth; + depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; @@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t print_graph_function(struct trace_iterator *iter) { + struct ftrace_graph_ent_entry *field; + struct fgraph_data *data = iter->private; struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; + int cpu = iter->cpu; + int ret; + + if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) { + per_cpu_ptr(data->cpu_data, cpu)->ignore = 0; + return TRACE_TYPE_HANDLED; + } + + /* + * If the last output failed, there's a possibility we need + * to print out the missing entry which would never go out. + */ + if (data && data->failed) { + field = &data->ent; + iter->cpu = data->cpu; + ret = print_graph_entry(field, s, iter); + if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { + per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; + ret = TRACE_TYPE_NO_CONSUME; + } + iter->cpu = cpu; + return ret; + } switch (entry->type) { case TRACE_GRAPH_ENT: { @@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter) * sizeof(struct ftrace_graph_ent_entry) is very small, * it can be safely saved at the stack. 
*/ - struct ftrace_graph_ent_entry *field, saved; + struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); saved = *field; return print_graph_entry(&saved, s, iter); @@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s) static void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ - struct fgraph_data *data = alloc_percpu(struct fgraph_data); + struct fgraph_data *data; int cpu; + iter->private = NULL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) - pr_warning("function graph tracer: not enough memory\n"); - else - for_each_possible_cpu(cpu) { - pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); - int *depth = &(per_cpu_ptr(data, cpu)->depth); - *pid = -1; - *depth = 0; - } + goto out_err; + + data->cpu_data = alloc_percpu(struct fgraph_cpu_data); + if (!data->cpu_data) + goto out_err_free; + + for_each_possible_cpu(cpu) { + pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + *pid = -1; + *depth = 0; + *ignore = 0; + } iter->private = data; + + return; + + out_err_free: + kfree(data); + out_err: + pr_warning("function graph tracer: not enough memory\n"); } static void graph_trace_close(struct trace_iterator *iter) { - free_percpu(iter->private); + struct fgraph_data *data = iter->private; + + if (data) { + free_percpu(data->cpu_data); + kfree(data); + } } static struct tracer graph_trace __read_mostly = { .name = "function_graph", .open = graph_trace_open, + .pipe_open = graph_trace_open, .close = graph_trace_close, + .pipe_close = graph_trace_close, .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, -- cgit v1.2.1 From f2942487ffb0c0a80b2312f667ea30dd55a24bb0 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Sun, 6 Dec 2009 14:02:44 +0100 Subject: tracing: Remove comparing of NULL to va_list in 
trace_array_vprintk() Olof Johansson stated the following: Comparing a va_list with NULL is bogus. It's supposed to be treated like an opaque type and only be manipulated with va_* accessors. Olof noticed that this code broke the ARM builds: kernel/trace/trace.c: In function 'trace_array_vprintk': kernel/trace/trace.c:1364: error: invalid operands to binary == (have 'va_list' and 'void *') kernel/trace/trace.c: In function 'tracing_mark_write': kernel/trace/trace.c:3349: error: incompatible type for argument 3 of 'trace_vprintk' This patch partly reverts c13d2f7c3231e873f30db92b96c8caa48f100f33 and re-installs the original mark_printk() mechanism. Reported-by: Olof Johansson Signed-off-by: Carsten Emde LKML-Reference: <4B1BAB74.104@osadl.org> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 484114d70743..88bd9ae2a9ed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1361,11 +1361,7 @@ int trace_array_vprintk(struct trace_array *tr, pause_graph_tracing(); raw_local_irq_save(irq_flags); __raw_spin_lock(&trace_buf_lock); - if (args == NULL) { - strncpy(trace_buf, fmt, TRACE_BUF_SIZE); - len = strlen(trace_buf); - } else - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; buffer = tr->buffer; @@ -3353,6 +3349,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } +static int mark_printk(const char *fmt, ...) 
+{ + int ret; + va_list args; + va_start(args, fmt); + ret = trace_vprintk(0, fmt, args); + va_end(args); + return ret; +} + static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -3379,7 +3385,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else buf[cnt] = '\0'; - cnt = trace_vprintk(0, buf, NULL); + cnt = mark_printk("%s", buf); kfree(buf); *fpos += cnt; -- cgit v1.2.1 From 21140f4d3387aa2213f1deea0128df1dbf924379 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 10 Dec 2009 14:00:51 +0800 Subject: perf_event: Fix perf_swevent_hrtimer() variable initialization fix: [] ? printk+0x1d/0x24 [] ? perf_prepare_sample+0x269/0x280 [] warn_slowpath_common+0x71/0xd0 [] ? perf_prepare_sample+0x269/0x280 [] warn_slowpath_null+0x1a/0x20 [] perf_prepare_sample+0x269/0x280 [] ? cpu_clock+0x53/0x90 [] __perf_event_overflow+0x2a8/0x300 [] perf_event_overflow+0x1b/0x30 [] perf_swevent_hrtimer+0x7f/0x120 This is because 'data.raw' variable not initialize. Signed-off-by: Xiao Guangrong Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras LKML-Reference: <4B208E93.1010801@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 94e1b28333ae..3a5d6c4786bb 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4010,6 +4010,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event->pmu->read(event); data.addr = 0; + data.raw = NULL; data.period = event->hw.last_period; regs = get_irq_regs(); /* -- cgit v1.2.1 From ea5b41f9d595be354f7a50e56b28c2d72e6e88a5 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Wed, 9 Dec 2009 14:29:36 -0800 Subject: lockdep: Avoid out of bounds array reference in save_trace() ia64 found this the hard way (because we currently have a stub for save_stack_trace() that does nothing). 
But it would be a good idea to be cautious in case a real save_stack_trace() bailed out with an error before it set trace->nr_entries. Signed-off-by: Tony Luck Acked-by: Peter Zijlstra Cc: luming.yu@intel.com LKML-Reference: <4b2024d085302c2a2@agluck-desktop.sc.intel.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7a3ae56b3a7f..4f8df01dbe51 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -386,7 +386,8 @@ static int save_trace(struct stack_trace *trace) * complete trace that maxes out the entries provided will be reported * as incomplete, friggin useless */ - if (trace->entries[trace->nr_entries-1] == ULONG_MAX) + if (trace->nr_entries != 0 && + trace->entries[trace->nr_entries-1] == ULONG_MAX) trace->nr_entries--; trace->max_entries = trace->nr_entries; -- cgit v1.2.1 From 4ca3ef71f54655af98b66e8ff308a47a2a580a53 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 10 Dec 2009 09:25:53 +0100 Subject: sched: Fix build warning in get_update_sysctl_factor() Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar LKML-Reference: --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0a60e8e9b094..3de3deab8095 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7033,7 +7033,7 @@ cpumask_var_t nohz_cpu_mask; */ static int get_update_sysctl_factor(void) { - unsigned int cpus = min(num_online_cpus(), 8); + unsigned int cpus = min_t(int, num_online_cpus(), 8); unsigned int factor; switch (sysctl_sched_tunable_scaling) { -- cgit v1.2.1 From 41d2e494937715d3150e5c75d01f0e75ae899337 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Nov 2009 17:05:44 +0100 Subject: hrtimer: Tune hrtimer_interrupt hang logic The hrtimer_interrupt hang logic adjusts min_delta_ns based on the execution time of the hrtimer 
callbacks. This is error-prone for virtual machines, where a guest vcpu can be scheduled out during the execution of the callbacks (and the callbacks themselves can do operations that translate to blocking operations in the hypervisor), which in turn can lead to a large min_delta_ns, rendering the system unusable. Replace the current heuristics with something more reliable. Allow the interrupt code to try 3 times to catch up with the lost time. If that fails, use the total time spent in the interrupt handler to defer the next timer interrupt so the system can catch up with other things which got delayed. Limit that deferment to 100ms. The retry events and the maximum time spent in the interrupt handler are recorded and exposed via /proc/timer_list. Inspired by a patch from Marcelo. Reported-by: Michael Tokarev Signed-off-by: Thomas Gleixner Tested-by: Marcelo Tosatti Cc: kvm@vger.kernel.org --- kernel/hrtimer.c | 97 ++++++++++++++++++++++++++++-------------------- kernel/time/timer_list.c | 5 ++- 2 files changed, 61 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ede527708123..931a4d99bc55 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) static int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { - ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); int res; @@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (expires.tv64 < 0) return -ETIME; - if (expires.tv64 >= expires_next->tv64) + if (expires.tv64 >= cpu_base->expires_next.tv64) + return 0; + + /* + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang
detection. We want the system + * to make progress. + */ + if (cpu_base->hang_detected) return 0; /* @@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, */ res = tick_program_event(expires, 0); if (!IS_ERR_VALUE(res)) - *expires_next = expires; + cpu_base->expires_next = expires; return res; } @@ -1217,30 +1226,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) #ifdef CONFIG_HIGH_RES_TIMERS -static int force_clock_reprogram; - -/* - * After 5 iteration's attempts, we consider that hrtimer_interrupt() - * is hanging, which could happen with something that slows the interrupt - * such as the tracing. Then we force the clock reprogramming for each future - * hrtimer interrupts to avoid infinite loops and use the min_delta_ns - * threshold that we will overwrite. - * The next tick event will be scheduled to 3 times we currently spend on - * hrtimer_interrupt(). This gives a good compromise, the cpus will spend - * 1/4 of their time to process the hrtimer interrupts. This is enough to - * let it running without serious starvation. 
- */ - -static inline void -hrtimer_interrupt_hanging(struct clock_event_device *dev, - ktime_t try_time) -{ - force_clock_reprogram = 1; - dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; - printk(KERN_WARNING "hrtimer: interrupt too slow, " - "forcing clock min delta to %llu ns\n", - (unsigned long long) dev->min_delta_ns); -} /* * High resolution timer interrupt * Called with interrupts disabled @@ -1249,21 +1234,15 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; - ktime_t expires_next, now; - int nr_retries = 0; - int i; + ktime_t expires_next, now, entry_time, delta; + int i, retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; - retry: - /* 5 retries is enough to notice a hang */ - if (!(++nr_retries % 5)) - hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now)); - - now = ktime_get(); - + entry_time = now = ktime_get(); +retry: expires_next.tv64 = KTIME_MAX; spin_lock(&cpu_base->lock); @@ -1325,10 +1304,48 @@ void hrtimer_interrupt(struct clock_event_device *dev) spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ - if (expires_next.tv64 != KTIME_MAX) { - if (tick_program_event(expires_next, force_clock_reprogram)) - goto retry; + if (expires_next.tv64 == KTIME_MAX || + !tick_program_event(expires_next, 0)) { + cpu_base->hang_detected = 0; + return; } + + /* + * The next timer was already expired due to: + * - tracing + * - long lasting callbacks + * - being scheduled away when running in a VM + * + * We need to prevent that we loop forever in the hrtimer + * interrupt routine. We give it 3 attempts to avoid + * overreacting on some spurious event. + */ + now = ktime_get(); + cpu_base->nr_retries++; + if (++retries < 3) + goto retry; + /* + * Give the system a chance to do something else than looping + * here. 
We stored the entry time, so we know exactly how long + * we spent here. We schedule the next event this amount of + * time away. + */ + cpu_base->nr_hangs++; + cpu_base->hang_detected = 1; + delta = ktime_sub(now, entry_time); + if (delta.tv64 > cpu_base->max_hang_time.tv64) + cpu_base->max_hang_time = delta; + /* + * Limit it to a sensible value as we enforce a longer + * delay. Give the CPU at least 100ms to catch up. + */ + if (delta.tv64 > 100 * NSEC_PER_MSEC) + expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + else + expires_next = ktime_add(now, delta); + tick_program_event(expires_next, 1); + printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", + ktime_to_ns(delta)); } /* diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 665c76edbf17..9d80db4747d4 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(expires_next); P(hres_active); P(nr_events); + P(nr_retries); + P(nr_hangs); + P_ns(max_hang_time); #endif #undef P #undef P_ns @@ -254,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.4\n"); + SEQ_printf(m, "Timer List Version: v0.5\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); -- cgit v1.2.1 From 5f201907dfe4ad42c44006ddfcec00ed12e59497 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 10 Dec 2009 10:56:29 +0100 Subject: hrtimer: move timer stats helper functions to hrtimer.c There is no reason to make timer_stats_hrtimer_set_start_info and friends visible to the rest of the kernel. So move all of them to hrtimer.c. Also make timer_stats_hrtimer_set_start_info a static inline function so it gets inlined and we avoid another function call. Based on a patch by Thomas Gleixner. 
Signed-off-by: Heiko Carstens LKML-Reference: <20091210095629.GC4144@osiris.boeblingen.de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 931a4d99bc55..d2f9239dc6ba 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -756,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } #endif /* CONFIG_HIGH_RES_TIMERS */ -#ifdef CONFIG_TIMER_STATS -void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) +static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) { +#ifdef CONFIG_TIMER_STATS if (timer->start_site) return; - - timer->start_site = addr; + timer->start_site = __builtin_return_address(0); memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); timer->start_pid = current->pid; +#endif } + +static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; #endif +} + +static inline void timer_stats_account_hrtimer(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + if (likely(!timer_stats_active)) + return; + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, 0); +#endif +} /* * Counterpart to lock_hrtimer_base above: -- cgit v1.2.1 From dfc12eb26a285df316be68a808af86964f3bff86 Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Thu, 10 Dec 2009 14:29:37 +0200 Subject: sched: Fix memory leak in two error corner cases If the second in each of these pairs of allocations fails, then the first one will not be freed in the error route out. Found by a static code analysis tool. 
Signed-off-by: Phil Carmody Acked-by: Peter Zijlstra LKML-Reference: <1260448177-28448-1-git-send-email-ext-phil.2.carmody@nokia.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3de3deab8095..36cc05a76947 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9855,13 +9855,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) se = kzalloc_node(sizeof(struct sched_entity), GFP_KERNEL, cpu_to_node(i)); if (!se) - goto err; + goto err_free_rq; init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; + err_free_rq: + kfree(cfs_rq); err: return 0; } @@ -9943,13 +9945,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) rt_se = kzalloc_node(sizeof(struct sched_rt_entity), GFP_KERNEL, cpu_to_node(i)); if (!rt_se) - goto err; + goto err_free_rq; init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; + err_free_rq: + kfree(rt_rq); err: return 0; } -- cgit v1.2.1 From 5e855db5d8fec44e6604eb245aa9077bbd3f0d05 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 10 Dec 2009 17:08:54 +0800 Subject: perf_event: Fix variable initialization in other codepaths Signed-off-by: Xiao Guangrong Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras LKML-Reference: <4B20BAA6.7010609@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3a5d6c4786bb..d891ec4a8100 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4300,6 +4300,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; + sample.raw = NULL; sample.addr = bp->attr.bp_addr; if (!perf_exclude_event(bp, regs)) -- cgit v1.2.1 From b9889ed1ddeca5a3f3569c8de7354e9e97d803ae Mon Sep 17 00:00:00 2001 From: 
Ingo Molnar Date: Thu, 10 Dec 2009 20:32:39 +0100 Subject: sched: Remove forced2_migrations stats This build warning: kernel/sched.c: In function 'set_task_cpu': kernel/sched.c:2070: warning: unused variable 'old_rq' Made me realize that the forced2_migrations stat looks pretty pointless (and a misnomer) - remove it. Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ kernel/sched_debug.c | 2 -- 2 files changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 36cc05a76947..bc68037f3199 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2067,7 +2067,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu); struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); @@ -2075,10 +2074,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (old_cpu != new_cpu) { p->se.nr_migrations++; -#ifdef CONFIG_SCHEDSTATS - if (task_hot(p, old_rq->clock, NULL)) - schedstat_inc(p, se.nr_forced2_migrations); -#endif perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } @@ -2521,7 +2516,6 @@ static void __sched_fork(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 0fc5287fe80f..5ae24fc65d75 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -432,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.nr_failed_migrations_running); P(se.nr_failed_migrations_hot); P(se.nr_forced_migrations); - P(se.nr_forced2_migrations); P(se.nr_wakeups); P(se.nr_wakeups_sync); P(se.nr_wakeups_migrate); @@ -508,7 +507,6 @@ void 
proc_sched_set_task(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; p->se.nr_wakeups_migrate = 0; -- cgit v1.2.1 From 14d8c9f3c09e7fd7b9af80904289fe204f5b93c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2009 00:53:17 +0000 Subject: signal: Fix racy access to __task_cred in kill_pid_info_as_uid() kill_pid_info_as_uid() accesses __task_cred() without being in a RCU read side critical section. tasklist_lock is not protecting that when CONFIG_TREE_PREEMPT_RCU=y. Convert the whole tasklist_lock section to rcu and use lock_task_sighand to prevent the exit race. Signed-off-by: Thomas Gleixner LKML-Reference: <20091210004703.232302055@linutronix.de> Acked-by: Oleg Nesterov --- kernel/signal.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6b982f2cf524..73316568a69c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1175,11 +1175,12 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, int ret = -EINVAL; struct task_struct *p; const struct cred *pcred; + unsigned long flags; if (!valid_signal(sig)) return ret; - read_lock(&tasklist_lock); + rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; @@ -1196,14 +1197,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, ret = security_task_kill(p, info, sig, secid); if (ret) goto out_unlock; - if (sig && p->sighand) { - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = __send_signal(sig, info, p, 1, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + + if (sig) { + if (lock_task_sighand(p, &flags)) { + ret = __send_signal(sig, info, p, 1, 0); + unlock_task_sighand(p, &flags); + } else + ret = -ESRCH; } out_unlock: - 
read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); -- cgit v1.2.1 From 7cf7db8df0b78076eafa4ead47559344ca7b7a43 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2009 00:53:21 +0000 Subject: signals: Fix more rcu assumptions 1) Remove the misleading comment in __sigqueue_alloc() which claims that holding a spinlock is equivalent to rcu_read_lock(). 2) Add a rcu_read_lock/unlock around the __task_cred() access in __sigqueue_alloc() This needs to be revisited to remove the remaining users of read_lock(&tasklist_lock) but that's outside the scope of this patch. Signed-off-by: Thomas Gleixner LKML-Reference: <20091210004703.269843657@linutronix.de> --- kernel/signal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 73316568a69c..f67545f9394c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -218,13 +218,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi struct user_struct *user; /* - * We won't get problems with the target's UID changing under us - * because changing it requires RCU be used, and if t != current, the - * caller must be holding the RCU readlock (by way of a spinlock) and - * we use RCU protection here + * Protect access to @t credentials. This can go away when all + * callers hold rcu read lock. */ + rcu_read_lock(); user = get_uid(__task_cred(t)->user); atomic_inc(&user->sigpending); + rcu_read_unlock(); if (override_rlimit || atomic_read(&user->sigpending) <= -- cgit v1.2.1 From d4581a239a40319205762b76c01eb6363f277efa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2009 00:52:51 +0000 Subject: sys: Fix missing rcu protection for __task_cred() access commit c69e8d9 (CRED: Use RCU to access another task's creds and to release a task's own creds) added non rcu_read_lock() protected access to task creds of the target task in set_prio_one(). 
The comment above the function says: * - the caller must hold the RCU read lock The calling code in sys_setpriority does read_lock(&tasklist_lock) but not rcu_read_lock(). This works only when CONFIG_TREE_PREEMPT_RCU=n. With CONFIG_TREE_PREEMPT_RCU=y the rcu_callbacks can run in the tick interrupt when they see no read side critical section. There is another instance of __task_cred() in sys_setpriority() itself which is equally unprotected. Wrap the whole code section into a rcu read side critical section to fix this quick and dirty. Will be revisited in course of the read_lock(&tasklist_lock) -> rcu crusade. Oleg noted further: This also fixes another bug here. find_task_by_vpid() is not safe without rcu_read_lock(). I do not mean it is not safe to use the result, just find_pid_ns() by itself is not safe. Usually tasklist gives enough protection, but if copy_process() fails it calls free_pid() lockless and does call_rcu(delayed_put_pid(). This means, without rcu lock find_pid_ns() can't scan the hash table safely. Signed-off-by: Thomas Gleixner LKML-Reference: <20091210004703.029784964@linutronix.de> Acked-by: Paul E. 
McKenney --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 9968c5fb55b9..bc1dc61c31ed 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -163,6 +163,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) if (niceval > 19) niceval = 19; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -200,6 +201,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); out: return error; } -- cgit v1.2.1 From d954fbf0ff6b5fdfb32350e85a2f15d3db976506 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2009 23:46:52 +0100 Subject: tracing: Fix wrong usage of strstrip in trace_ksyms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit strstrip returns a pointer to the first non-space character, but the code in parse_ksym_trace_str() ignores that. strstrip is now must_check and therefore we get the correct warning: kernel/trace/trace_ksym.c:294: warning: ignoring return value of ‘strstrip’, declared with attribute warn_unused_result We are really not interested in leading whitespace here. Fix that and clean up the dozen kfree() exit paths.
Signed-off-by: Thomas Gleixner Cc: Xiao Guangrong Cc: Steven Rostedt --- kernel/trace/trace_ksym.c | 49 ++++++++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index ddfa0fd43bc0..64e7a5bd6692 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -277,21 +277,20 @@ static ssize_t ksym_trace_filter_write(struct file *file, { struct trace_ksym *entry; struct hlist_node *node; - char *input_string, *ksymname = NULL; + char *buf, *input_string, *ksymname = NULL; unsigned long ksym_addr = 0; int ret, op, changed = 0; - input_string = kzalloc(count + 1, GFP_KERNEL); - if (!input_string) + buf = kzalloc(count + 1, GFP_KERNEL); + if (!buf) return -ENOMEM; - if (copy_from_user(input_string, buffer, count)) { - kfree(input_string); - return -EFAULT; - } - input_string[count] = '\0'; + ret = -EFAULT; + if (copy_from_user(buf, buffer, count)) + goto out; - strstrip(input_string); + buf[count] = '\0'; + input_string = strstrip(buf); /* * Clear all breakpoints if: @@ -302,15 +301,13 @@ static ssize_t ksym_trace_filter_write(struct file *file, if (!input_string[0] || !strcmp(input_string, "0") || !strcmp(input_string, "*:---")) { __ksym_trace_reset(); - kfree(input_string); - return count; + ret = 0; + goto out; } ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); - if (ret < 0) { - kfree(input_string); - return ret; - } + if (ret < 0) + goto out; mutex_lock(&ksym_tracer_mutex); @@ -321,7 +318,7 @@ static ssize_t ksym_trace_filter_write(struct file *file, if (entry->attr.bp_type != op) changed = 1; else - goto out; + goto out_unlock; break; } } @@ -336,28 +333,24 @@ static ssize_t ksym_trace_filter_write(struct file *file, if (IS_ERR(entry->ksym_hbp)) ret = PTR_ERR(entry->ksym_hbp); else - goto out; + goto out_unlock; } /* Error or "symbol:---" case: drop it */ ksym_filter_entry_count--; 
hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); kfree(entry); - goto out; + goto out_unlock; } else { /* Check for malformed request: (4) */ - if (op == 0) - goto out; - ret = process_new_ksym_entry(ksymname, op, ksym_addr); + if (op) + ret = process_new_ksym_entry(ksymname, op, ksym_addr); } -out: +out_unlock: mutex_unlock(&ksym_tracer_mutex); - - kfree(input_string); - - if (!ret) - ret = count; - return ret; +out: + kfree(buf); + return !ret ? count : ret; } static const struct file_operations ksym_tracing_fops = { -- cgit v1.2.1 From 184210154b9aa570099183f6c062ac4eb11190b7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Dec 2009 22:54:27 -0500 Subject: ring-buffer: Use sync sched protection on ring buffer resizing There was a comment in the ring buffer code that says the calling layers should prevent tracing or reading of the ring buffer while resizing. I have discovered that the tracers do not honor this arrangement. This patch moves the disabling and synchronizing the ring buffer to a higher layer during resizing. This guarantees that no writes are occurring while the resize takes place. 
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a1ca4956ab5e..0d64c51ab4df 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1193,9 +1193,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) struct list_head *p; unsigned i; - atomic_inc(&cpu_buffer->record_disabled); - synchronize_sched(); - spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); @@ -1214,9 +1211,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); - - atomic_dec(&cpu_buffer->record_disabled); - } static void @@ -1227,9 +1221,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *p; unsigned i; - atomic_inc(&cpu_buffer->record_disabled); - synchronize_sched(); - spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); @@ -1245,8 +1236,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); - - atomic_dec(&cpu_buffer->record_disabled); } /** @@ -1254,11 +1243,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, * @buffer: the buffer to resize. * @size: the new size. * - * The tracer is responsible for making sure that the buffer is - * not being used while changing the size. - * Note: We may be able to change the above requirement by using - * RCU synchronizations. - * * Minimum size is 2 * BUF_PAGE_SIZE. * * Returns -1 on failure. @@ -1290,6 +1274,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (size == buffer_size) return size; + atomic_inc(&buffer->record_disabled); + + /* Make sure all writers are done with this buffer. 
*/ + synchronize_sched(); + mutex_lock(&buffer->mutex); get_online_cpus(); @@ -1352,6 +1341,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); + return size; free_pages: @@ -1361,6 +1352,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); return -ENOMEM; /* @@ -1370,6 +1362,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) out_fail: put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); return -1; } EXPORT_SYMBOL_GPL(ring_buffer_resize); -- cgit v1.2.1 From dd7f59435782a02ceb6d16b9ce823dd3345d75ec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Dec 2009 23:20:52 -0500 Subject: ring-buffer: Move resize integrity check under reader lock While using an application that does splice on the ftrace ring buffer at start up, I triggered an integrity check failure. Looking into this, I discovered that resizing the buffer performs an integrity check after the buffer is resized. This check unfortunately is performed after it releases the reader lock. If a reader is reading the buffer it may cause the integrity check to trigger a false failure. This patch simply moves the integrity checker under the protection of the ring buffer reader lock.
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0d64c51ab4df..eccb4cf1e998 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1208,9 +1208,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) return; rb_reset_cpu(cpu_buffer); - spin_unlock_irq(&cpu_buffer->reader_lock); - rb_check_pages(cpu_buffer); + + spin_unlock_irq(&cpu_buffer->reader_lock); } static void @@ -1233,9 +1233,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); - spin_unlock_irq(&cpu_buffer->reader_lock); - rb_check_pages(cpu_buffer); + + spin_unlock_irq(&cpu_buffer->reader_lock); } /** -- cgit v1.2.1 From bb6eddf7676e1c1f3e637aa93c5224488d99036f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2009 15:35:10 +0100 Subject: clockevents: Prevent clockevent_devices list corruption on cpu hotplug Xiaotian Feng triggered a list corruption in the clock events list on CPU hotplug and debugged the root cause. If a CPU registers more than one per cpu clock event device, then only the active clock event device is removed on CPU_DEAD. The unused devices are kept in the clock events device list. On CPU up the clock event devices are registered again, which means that we list_add an already enqueued list_head. That results in list corruption. Resolve this by removing all devices which are associated to the dead CPU on CPU_DEAD. 
Reported-by: Xiaotian Feng Signed-off-by: Thomas Gleixner Tested-by: Xiaotian Feng Cc: stable@kernel.org --- kernel/time/clockevents.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 20a8920029ee..91db2e33d86a 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -238,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old, */ void clockevents_notify(unsigned long reason, void *arg) { - struct list_head *node, *tmp; + struct clock_event_device *dev, *tmp; unsigned long flags; + int cpu; spin_lock_irqsave(&clockevents_lock, flags); clockevents_do_notify(reason, arg); @@ -250,8 +251,19 @@ void clockevents_notify(unsigned long reason, void *arg) * Unregister the clock event devices which were * released from the users in the notify chain. */ - list_for_each_safe(node, tmp, &clockevents_released) - list_del(node); + list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + list_del(&dev->list); + /* + * Now check whether the CPU has left unused per cpu devices + */ + cpu = *((int *)arg); + list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { + if (cpumask_test_cpu(cpu, dev->cpumask) && + cpumask_weight(dev->cpumask) == 1) { + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + list_del(&dev->list); + } + } break; default: break; -- cgit v1.2.1 From 84667d4849b0e0a939a76f9f62d45fa3b4d59692 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 11 Dec 2009 08:43:13 -0600 Subject: kgdb: Read buffer overflow Roel Kluin reported an error found with Parfait. We want to ensure that kgdb_info[-1] never gets accessed. Also check to ensure any negative tid does not exceed the size of the shadow CPU array, else report critical debug context because it is an internal kgdb failure.
Reported-by: Roel Kluin Signed-off-by: Jason Wessel --- kernel/kgdb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 7d7014634022..29357a9ccfb2 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -541,12 +541,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid) */ if (tid == 0 || tid == -1) tid = -atomic_read(&kgdb_active) - 2; - if (tid < 0) { + if (tid < -1 && tid > -NR_CPUS - 2) { if (kgdb_info[-tid - 2].task) return kgdb_info[-tid - 2].task; else return idle_task(-tid - 2); } + if (tid <= 0) { + printk(KERN_ERR "KGDB: Internal thread select error\n"); + dump_stack(); + return NULL; + } /* * find_task_by_pid_ns() does not take the tasklist lock anymore -- cgit v1.2.1 From 028e7b175970be8fca58bfd7d61cc375babe40b7 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 11 Dec 2009 08:43:17 -0600 Subject: kgdb: allow for cpu switch when single stepping The kgdb core should not assume that a single step operation of a kernel thread will complete on the same CPU. The single step flag is set at the "thread" level and it is possible in a multi cpu system that a kernel thread can get scheduled on another cpu the next time it is run. As a further safety net in case a slave cpu is hung, the debug master cpu will try 100 times before giving up and assuming control of the slave cpus is no longer possible. It is more useful to be able to get some information out of kgdb instead of spinning forever. Signed-off-by: Jason Wessel --- kernel/kgdb.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 29357a9ccfb2..ca21fe98e8de 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread; struct task_struct *kgdb_contthread; int kgdb_single_step; +pid_t kgdb_sstep_pid; /* Our I/O buffers. 
*/ static char remcom_in_buffer[BUFMAX]; @@ -1400,6 +1401,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; unsigned long flags; + int sstep_tries = 100; int error = 0; int i, cpu; @@ -1430,13 +1432,14 @@ acquirelock: cpu_relax(); /* - * Do not start the debugger connection on this CPU if the last - * instance of the exception handler wanted to come into the - * debugger on a different CPU via a single step + * For single stepping, try to only enter on the processor + * that was single stepping. To gaurd against a deadlock, the + * kernel will only try for the value of sstep_tries before + * giving up and continuing on. */ if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && - atomic_read(&kgdb_cpu_doing_single_step) != cpu) { - + (kgdb_info[cpu].task && + kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); touch_softlockup_watchdog(); clocksource_touch_watchdog(); @@ -1529,6 +1532,13 @@ acquirelock: } kgdb_restore: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); + if (kgdb_info[sstep_cpu].task) + kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; + else + kgdb_sstep_pid = 0; + } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); touch_softlockup_watchdog(); -- cgit v1.2.1 From d625e9c0d706eb43afbf52634d5cecacae1d57cc Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 27 Apr 2009 13:20:21 -0500 Subject: kgdb: continue and warn on signal passing from gdb On some architectures for the segv trap, gdb wants to pass the signal back on continue. For kgdb this is not the default behavior, because it can cause the kernel to crash if you arbitrarily pass back a exception outside of kgdb. Instead of causing instability, pass a message back to gdb about the supported kgdb signal passing and execute a standard kgdb continue operation. 
Signed-off-by: Jason Wessel --- kernel/kgdb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index ca21fe98e8de..8584eac55e30 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1210,8 +1210,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks) return 1; } else { - error_packet(remcom_out_buffer, -EINVAL); - return 0; + kgdb_msg_write("KGDB only knows signal 9 (pass)" + " and 15 (pass and disconnect)\n" + "Executing a continue without signal passing\n", 0); + remcom_in_buffer[0] = 'c'; } /* Indicate fall through */ -- cgit v1.2.1 From 7f8b7ed6f825c729332b8190aca55c6bf95b158e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 11 Dec 2009 08:43:20 -0600 Subject: kgdb: Always process the whole breakpoint list on activate or deactivate This patch fixes 2 edge cases in using kgdb in conjunction with gdb. 1) kgdb_deactivate_sw_breakpoints() should process the entire array of breakpoints. The failure to do so results in breakpoints that you cannot remove, because a break point can only be removed if its state flag is set to BP_SET. The easy way to duplicate this problem is to plant a break point in a kernel module and then unload the kernel module. 2) kgdb_activate_sw_breakpoints() should process the entire array of breakpoints. The failure to do so results in missed breakpoints when a breakpoint cannot be activated. 
Signed-off-by: Jason Wessel --- kernel/kgdb.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 8584eac55e30..2eb517e23514 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -625,7 +625,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) static int kgdb_activate_sw_breakpoints(void) { unsigned long addr; - int error = 0; + int error; + int ret = 0; int i; for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { @@ -635,13 +636,16 @@ static int kgdb_activate_sw_breakpoints(void) addr = kgdb_break[i].bpt_addr; error = kgdb_arch_set_breakpoint(addr, kgdb_break[i].saved_instr); - if (error) - return error; + if (error) { + ret = error; + printk(KERN_INFO "KGDB: BP install failed: %lx", addr); + continue; + } kgdb_flush_swbreak_addr(addr); kgdb_break[i].state = BP_ACTIVE; } - return 0; + return ret; } static int kgdb_set_sw_break(unsigned long addr) @@ -688,7 +692,8 @@ static int kgdb_set_sw_break(unsigned long addr) static int kgdb_deactivate_sw_breakpoints(void) { unsigned long addr; - int error = 0; + int error; + int ret = 0; int i; for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { @@ -697,13 +702,15 @@ static int kgdb_deactivate_sw_breakpoints(void) addr = kgdb_break[i].bpt_addr; error = kgdb_arch_remove_breakpoint(addr, kgdb_break[i].saved_instr); - if (error) - return error; + if (error) { + printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); + ret = error; + } kgdb_flush_swbreak_addr(addr); kgdb_break[i].state = BP_SET; } - return 0; + return ret; } static int kgdb_remove_sw_break(unsigned long addr) -- cgit v1.2.1 From 03889384cee7a198a79447c1ea6aca2c8e54d155 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Dec 2009 09:48:22 -0500 Subject: tracing: Add trace_dump_stack() I've been asked a few times about how to find out what is calling some location in the kernel. One way is to use dynamic function tracing and implement the func_stack_trace. 
But this only finds out who is calling a particular function. It does not tell you who is calling that function and entering a specific if conditional. I have myself implemented a quick version of trace_dump_stack() for this purpose a few times, and just needed it now. This is when I realized that this would be a good tool to have in the kernel like trace_printk(). Using trace_dump_stack() is similar to dump_stack() except that it writes to the trace buffer instead and can be used in critical locations. For example: @@ -5485,8 +5485,12 @@ need_resched_nonpreemptible: if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; - else + else { deactivate_task(rq, prev, 1); + trace_printk("Deactivating task %s:%d\n", + prev->comm, prev->pid); + trace_dump_stack(); + } switch_count = &prev->nvcsw; } Produces: <...>-3249 [001] 296.105269: schedule: Deactivating task ntpd:3249 <...>-3249 [001] 296.105270: => schedule => schedule_hrtimeout_range => poll_schedule_timeout => do_select => core_sys_select => sys_select => system_call_fastpath Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 88bd9ae2a9ed..f531301b7a3b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1151,6 +1151,22 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, __ftrace_trace_stack(tr->buffer, flags, skip, pc); } +/** + * trace_dump_stack - record a stack back trace in the trace buffer + */ +void trace_dump_stack(void) +{ + unsigned long flags; + + if (tracing_disabled || tracing_selftest_running) + return 0; + + local_save_flags(flags); + + /* skipping 3 traces, seems to get us at the caller of this function */ + __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); +} + void ftrace_trace_userstack(struct ring_buffer *buffer, 
unsigned long flags, int pc) { -- cgit v1.2.1 From cc51a0fca66658ea710db566ba17e80e3f7d4957 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Dec 2009 11:54:51 -0500 Subject: tracing: Add stack trace to irqsoff tracer The irqsoff and friends tracers help in finding causes of latency in the kernel. They also work with the function tracer to show what was happening when interrupts or preemption are disabled. But the function tracer has a bit of an overhead and can cause exaggerated readings. Currently, when tracing with /proc/sys/kernel/ftrace_enabled = 0, where the function tracer is disabled, the information that is provided can end up being useless. For example, a 2 and a half millisecond latency only showed: # tracer: preemptirqsoff # # preemptirqsoff latency trace v1.1.5 on 2.6.32 # -------------------------------------------------------------------- # latency: 2463 us, #4/4, CPU#2 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4) # ----------------- # | task: -4242 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # => started at: _spin_lock_irqsave # => ended at: remove_wait_queue # # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| /_--=> lock-depth # |||||/ delay # cmd pid |||||| time | caller # \ / |||||| \ | / hackbenc-4242 2d.... 0us!: trace_hardirqs_off <-_spin_lock_irqsave hackbenc-4242 2...1. 2463us+: _spin_unlock_irqrestore <-remove_wait_queue hackbenc-4242 2...1. 2466us : trace_preempt_on <-remove_wait_queue The above lets us know that hackbench with pid 4242 grabbed a spin lock somewhere and enabled preemption at remove_wait_queue. This helps a little but where this actually happened is not informative. This patch adds the stack dump to the end of the irqsoff tracer. This provides the following output: hackbenc-4242 2d.... 0us!: trace_hardirqs_off <-_spin_lock_irqsave hackbenc-4242 2...1. 2463us+: _spin_unlock_irqrestore <-remove_wait_queue hackbenc-4242 2...1.
2466us : trace_preempt_on <-remove_wait_queue hackbenc-4242 2...1. 2467us : => sub_preempt_count => _spin_unlock_irqrestore => remove_wait_queue => free_poll_entry => poll_freewait => do_sys_poll => sys_poll => system_call_fastpath Now we see that the culprit of this latency was the free_poll_entry code. Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 3aa7eaa2114c..2974bc7538c7 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -151,6 +151,8 @@ check_critical_timing(struct trace_array *tr, goto out_unlock; trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + /* Skip 5 functions to get to the irq/preempt enable function */ + __trace_stack(tr, flags, 5, pc); if (data->critical_sequence != max_sequence) goto out_unlock; -- cgit v1.2.1 From 5ec93d1154fd1e269162398f8e70efc7e004485d Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 30 Nov 2009 13:18:45 +0000 Subject: tty: Move the leader test in disassociate There are two call points, both want to check that tty->signal->leader is set. 
Move the test into disassociate_ctty() as that will make locking changes easier in a bit Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1143012951e9..6f50ef55a6f3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -971,7 +971,7 @@ NORET_TYPE void do_exit(long code) exit_thread(); cgroup_exit(tsk, 1); - if (group_dead && tsk->signal->leader) + if (group_dead) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); -- cgit v1.2.1 From 01fc0ac198eabcbf460e1ed058860a935b6c2c9a Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 19 Apr 2009 21:57:19 +0200 Subject: kbuild: move bounds.h to include/generated Signed-off-by: Sam Ravnborg Cc: Al Viro Signed-off-by: Michal Marek --- kernel/bounds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bounds.c b/kernel/bounds.c index 3c5301381837..98a51f26c136 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -12,7 +12,7 @@ void foo(void) { - /* The enum constants to put into include/linux/bounds.h */ + /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); /* End of constants */ -- cgit v1.2.1 From 273b281fa22c293963ee3e6eec418f5dda2dbc83 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 18 Oct 2009 00:52:28 +0200 Subject: kbuild: move utsrelease.h to include/generated Fix up all users of utsrelease.h Signed-off-by: Sam Ravnborg Signed-off-by: Michal Marek --- kernel/kexec.c | 2 +- kernel/trace/trace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index f336e2107f98..83f54e2a6eed 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git 
a/kernel/trace/trace.c b/kernel/trace/trace.c index 88bd9ae2a9ed..bfb1b64bfa9d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -12,7 +12,7 @@ * Copyright (C) 2004 William Lee Irwin III */ #include -#include +#include #include #include #include -- cgit v1.2.1 From 7539a3b3d1f892dd97eaf094134d7de55c13befe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 13 Dec 2009 00:07:30 +0100 Subject: sched: Make wakeup side and atomic variants of completion API irq safe Alan Stern noticed that all the wakeup side (and atomic) variants of the completion APIs should be irq safe, but the newly introduced completion_done() and try_wait_for_completion() aren't. The use of the irq unsafe variants in IRQ contexts can cause crashes/hangs. Fix the problem by making them use spin_lock_irqsave() and spin_unlock_irqrestore(). Reported-by: Alan Stern Signed-off-by: Rafael J. Wysocki Cc: Linus Torvalds Cc: Zhang Rui Cc: pm list Cc: Peter Zijlstra Cc: David Chinner Cc: Lachlan McIlroy LKML-Reference: <200912130007.30541.rjw@sisk.pl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ff39cadf621e..8b3532f262d7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5908,14 +5908,15 @@ EXPORT_SYMBOL(wait_for_completion_killable); */ bool try_wait_for_completion(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; else x->done--; - spin_unlock_irq(&x->wait.lock); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -5930,12 +5931,13 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - spin_unlock_irq(&x->wait.lock); +
spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(completion_done); -- cgit v1.2.1 From 663997d417330a59a566452f52cfa04c8ffd190b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 12 Dec 2009 13:57:27 -0800 Subject: sched: Use pr_fmt() and pr_() - Convert printk(KERN_ to pr_ (not KERN_DEBUG) - Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Coalesce long format strings - Add missing \n to "ERROR: !SD_LOAD_BALANCE domain has parent" Signed-off-by: Joe Perches Cc: Peter Zijlstra LKML-Reference: <1260655047.2637.7.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- kernel/sched.c | 94 ++++++++++++++++++++++--------------------------- kernel/sched_idletask.c | 2 +- 2 files changed, 43 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8b3532f262d7..258c73c6a2f3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -26,6 +26,8 @@ * Thomas Gleixner, Mike Kravetz */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -5337,8 +5339,8 @@ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); + pr_err("BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); print_modules(); @@ -6906,23 +6908,23 @@ void sched_show_task(struct task_struct *p) unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, + pr_info("%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); #if BITS_PER_LONG == 32 if (state == TASK_RUNNING) - printk(KERN_CONT " running "); + pr_cont(" running "); else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); + pr_cont(" %08lx ", thread_saved_pc(p)); #else if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); + pr_cont(" running task "); else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); + pr_cont(" %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + pr_cont("%5lu %5d %6d 0x%08lx\n", free, task_pid_nr(p), task_pid_nr(p->real_parent), (unsigned long)task_thread_info(p)->flags); @@ -6934,11 +6936,9 @@ void show_state_filter(unsigned long state_filter) struct task_struct *g, *p; #if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); + pr_info(" task PC stack pid father\n"); #else - printk(KERN_INFO - " task PC stack pid father\n"); + pr_info(" task PC stack pid father\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -7296,9 +7296,8 @@ again: * leave kernel. 
*/ if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); + pr_info("process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } } @@ -7805,48 +7804,44 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_DEBUG "%*s domain %d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); + pr_cont("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); + pr_err("ERROR: !SD_LOAD_BALANCE domain has parent\n"); return -1; } - printk(KERN_CONT "span %s level %s\n", str, sd->name); + pr_cont("span %s level %s\n", str, sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); + pr_err("ERROR: domain->span does not contain CPU%d\n", cpu); } if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); + pr_err("ERROR: domain->groups does not contain CPU%d\n", cpu); } printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); + pr_cont("\n"); + pr_err("ERROR: group is NULL\n"); break; } if (!group->cpu_power) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); + pr_cont("\n"); + pr_err("ERROR: domain->cpu_power not set\n"); break; } if (!cpumask_weight(sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: empty group\n"); + pr_cont("\n"); + pr_err("ERROR: empty group\n"); break; } if (cpumask_intersects(groupmask, sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); + pr_cont("\n"); + pr_err("ERROR: repeated CPUs\n"); break; } @@ -7854,23 +7849,21 @@ static int sched_domain_debug_one(struct 
sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - printk(KERN_CONT " %s", str); + pr_cont(" %s", str); if (group->cpu_power != SCHED_LOAD_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->cpu_power); + pr_cont(" (cpu_power = %d)", group->cpu_power); } group = group->next; } while (group != sd->groups); - printk(KERN_CONT "\n"); + pr_cont("\n"); if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + pr_err("ERROR: groups don't span domain->span\n"); if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); + pr_err("ERROR: parent span is not a superset of domain->span\n"); return 0; } @@ -8426,8 +8419,7 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for node %d\n", - num); + pr_warning("Can not alloc domain group for node %d\n", num); return -ENOMEM; } d->sched_group_nodes[num] = sg; @@ -8456,8 +8448,8 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); + pr_warning("Can not alloc domain group for node %d\n", + j); return -ENOMEM; } sg->cpu_power = 0; @@ -8685,7 +8677,7 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, d->sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!d->sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); + pr_warning("Can not alloc sched group node list\n"); return sa_notcovered; } sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; @@ -8702,7 +8694,7 @@ static enum s_alloc 
__visit_domain_allocation_hell(struct s_data *d, return sa_send_covered; d->rd = alloc_rootdomain(); if (!d->rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); + pr_warning("Cannot alloc root domain\n"); return sa_tmpmask; } return sa_rootdomain; @@ -9684,13 +9676,11 @@ void __might_sleep(char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); + pr_err("BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); debug_show_held_locks(current); if (irqs_disabled()) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 33d5384a73a8..b810e22772d5 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -35,7 +35,7 @@ static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) { spin_unlock_irq(&rq->lock); - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + pr_err("bad: scheduling from the idle thread!\n"); dump_stack(); spin_lock_irq(&rq->lock); } -- cgit v1.2.1 From 87d9b4e1c52867a45331a9a5495f6448e0c68b23 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:14:20 +0800 Subject: tracing: Extract duplicate ftrace_raw_init_event_foo() Use a generic trace_event_raw_init() function for all event's raw_init callbacks (but kprobes) instead of defining the same version for each of these. This shrinks the kernel code: text data bss dec hex filename 5355293 1961928 7103260 14420481 dc0a01 vmlinux.o.old 5346802 1961864 7103260 14411926 dbe896 vmlinux.o raw_init can't be removed, because ftrace events and kprobe events use different raw_init callbacks. 
Though it's possible to totally remove raw_init, I choose to leave it as it is for now. Signed-off-by: Li Zefan Acked-by: Steven Rostedt Cc: Jason Baron Cc: Ingo Molnar LKML-Reference: <4B1DC48C.7080603@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1d18315dc836..8ed66e0d476b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -105,6 +105,20 @@ void trace_destroy_fields(struct ftrace_event_call *call) } } +int trace_event_raw_init(struct ftrace_event_call *call) +{ + int id; + + id = register_ftrace_event(call->event); + if (!id) + return -ENODEV; + call->id = id; + INIT_LIST_HEAD(&call->fields); + + return 0; +} +EXPORT_SYMBOL_GPL(trace_event_raw_init); + static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { -- cgit v1.2.1 From 614a71a26ba3d97e9fa85649db69a682b78e407d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:14:36 +0800 Subject: tracing: Pull up calls to trace_define_common_fields() Call trace_define_common_fields() in event_create_dir() only. 
This avoids trace events to handle it from their define_fields callbacks and shrinks the kernel code size: text data bss dec hex filename 5346802 1961864 7103260 14411926 dbe896 vmlinux.o.old 5345151 1961864 7103260 14410275 dbe223 vmlinux.o Signed-off-by: Li Zefan Acked-by: Steven Rostedt Cc: Ingo Molnar Cc: Jason Baron Cc: Masami Hiramatsu LKML-Reference: <4B1DC49C.8000107@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 7 ++++--- kernel/trace/trace_export.c | 4 ---- kernel/trace/trace_kprobe.c | 8 -------- kernel/trace/trace_syscalls.c | 8 -------- 4 files changed, 4 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8ed66e0d476b..97b0b3aa166d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -78,7 +78,7 @@ EXPORT_SYMBOL_GPL(trace_define_field); if (ret) \ return ret; -int trace_define_common_fields(struct ftrace_event_call *call) +static int trace_define_common_fields(struct ftrace_event_call *call) { int ret; struct trace_entry ent; @@ -91,7 +91,6 @@ int trace_define_common_fields(struct ftrace_event_call *call) return ret; } -EXPORT_SYMBOL_GPL(trace_define_common_fields); void trace_destroy_fields(struct ftrace_event_call *call) { @@ -927,7 +926,9 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, id); if (call->define_fields) { - ret = call->define_fields(call); + ret = trace_define_common_fields(call); + if (!ret) + ret = call->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index dff8c84ddf17..458e5bfe26d0 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -184,10 +184,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ struct struct_name field; \ int ret; \ \ - ret = trace_define_common_fields(event_call); \ - 
if (ret) \ - return ret; \ - \ tstruct; \ \ return ret; \ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aff5f80b59b8..e3c80e925896 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1113,10 +1113,6 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) struct kprobe_trace_entry field; struct trace_probe *tp = (struct trace_probe *)event_call->data; - ret = trace_define_common_fields(event_call); - if (!ret) - return ret; - DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ @@ -1131,10 +1127,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) struct kretprobe_trace_entry field; struct trace_probe *tp = (struct trace_probe *)event_call->data; - ret = trace_define_common_fields(event_call); - if (!ret) - return ret; - DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 57501d90096a..b957edd0ca3b 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -217,10 +217,6 @@ int syscall_enter_define_fields(struct ftrace_event_call *call) int i; int offset = offsetof(typeof(trace), args); - ret = trace_define_common_fields(call); - if (ret) - return ret; - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); if (ret) return ret; @@ -241,10 +237,6 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) struct syscall_trace_exit trace; int ret; - ret = trace_define_common_fields(call); - if (ret) - return ret; - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); if (ret) return ret; -- cgit v1.2.1 From 3b8e4273814a7f9e9a74ece517d9206fea919aaa Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:14:52 +0800 
Subject: tracing: Move a printk out of ftrace_raw_reg_event_foo() Move the printk from each ftrace_raw_reg_event_foo() to its caller ftrace_event_enable_disable(). This avoids each regfunc trace event callbacks to handle a same error report that can be carried from the caller. See how much space this saves: text data bss dec hex filename 5345151 1961864 7103260 14410275 dbe223 vmlinux.o.old 5331487 1961864 7103260 14396611 dbacc3 vmlinux.o Signed-off-by: Li Zefan Acked-by: Steven Rostedt Cc: Jason Baron LKML-Reference: <4B1DC4AC.802@cn.fujitsu.com> [start cmdline record before calling regfunc to avoid lost window of pid to comm resolution] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 20 +++++++++++++++----- kernel/trace/trace_syscalls.c | 10 ++-------- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 97b0b3aa166d..189b09baf4fb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -118,9 +118,11 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); -static void ftrace_event_enable_disable(struct ftrace_event_call *call, +static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { + int ret = 0; + switch (enable) { case 0: if (call->enabled) { @@ -131,12 +133,20 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, break; case 1: if (!call->enabled) { - call->enabled = 1; tracing_start_cmdline_record(); - call->regfunc(call); + ret = call->regfunc(call); + if (ret) { + tracing_stop_cmdline_record(); + pr_info("event trace: Could not enable event " + "%s\n", call->name); + break; + } + call->enabled = 1; } break; } + + return ret; } static void ftrace_clear_events(void) @@ -415,7 +425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, case 0: case 1: mutex_lock(&event_mutex); - 
ftrace_event_enable_disable(call, val); + ret = ftrace_event_enable_disable(call, val); mutex_unlock(&event_mutex); break; @@ -425,7 +435,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; - return cnt; + return ret ? ret : cnt; } static ssize_t diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b957edd0ca3b..75289f372dd2 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -325,10 +325,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) ret = register_trace_sys_enter(ftrace_syscall_enter); - if (ret) { - pr_info("event trace: Could not activate" - "syscall entry trace point"); - } else { + if (!ret) { set_bit(num, enabled_enter_syscalls); sys_refcount_enter++; } @@ -362,10 +359,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) ret = register_trace_sys_exit(ftrace_syscall_exit); - if (ret) { - pr_info("event trace: Could not activate" - "syscall exit trace point"); - } else { + if (!ret) { set_bit(num, enabled_exit_syscalls); sys_refcount_exit++; } -- cgit v1.2.1 From 311d16da575f53c3367099579736c1d233efe0dc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:15:11 +0800 Subject: ftrace: Return EINVAL when writing invalid val to set_ftrace_filter Currently it doesn't warn user on invalid value: # echo nonexist_symbol > set_ftrace_filter or: # echo 'nonexist_symbol:mod:fuse' > set_ftrace_filter Better make it return failure.
Signed-off-by: Li Zefan Acked-by: Steven Rostedt LKML-Reference: <4B1DC4BF.2070003@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/ftrace.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e51a1bcb7bed..08a3fb5b3187 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1724,7 +1724,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) return ftrace_match(str, regex, len, type); } -static void ftrace_match_records(char *buff, int len, int enable) +static int ftrace_match_records(char *buff, int len, int enable) { unsigned int search_len; struct ftrace_page *pg; @@ -1733,6 +1733,7 @@ static void ftrace_match_records(char *buff, int len, int enable) char *search; int type; int not; + int found = 0; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; type = filter_parse_regex(buff, len, &search, &not); @@ -1750,6 +1751,7 @@ static void ftrace_match_records(char *buff, int len, int enable) rec->flags &= ~flag; else rec->flags |= flag; + found = 1; } /* * Only enable filtering if we have a function that @@ -1759,6 +1761,8 @@ static void ftrace_match_records(char *buff, int len, int enable) ftrace_filtered = 1; } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); + + return found; } static int @@ -1780,7 +1784,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, return 1; } -static void ftrace_match_module_records(char *buff, char *mod, int enable) +static int ftrace_match_module_records(char *buff, char *mod, int enable) { unsigned search_len = 0; struct ftrace_page *pg; @@ -1789,6 +1793,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) char *search = buff; unsigned long flag; int not = 0; + int found = 0; flag = enable ?
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; @@ -1819,12 +1824,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) rec->flags &= ~flag; else rec->flags |= flag; + found = 1; } if (enable && (rec->flags & FTRACE_FL_FILTER)) ftrace_filtered = 1; } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); + + return found; } /* @@ -1853,8 +1861,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) if (!strlen(mod)) return -EINVAL; - ftrace_match_module_records(func, mod, enable); - return 0; + if (ftrace_match_module_records(func, mod, enable)) + return 0; + return -EINVAL; } static struct ftrace_func_command ftrace_mod_cmd = { @@ -2151,8 +2160,9 @@ static int ftrace_process_regex(char *buff, int len, int enable) func = strsep(&next, ":"); if (!next) { - ftrace_match_records(func, len, enable); - return 0; + if (ftrace_match_records(func, len, enable)) + return 0; + return ret; } /* command found */ -- cgit v1.2.1 From 313254a9400d388b46150c0f355e216418a2f598 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:15:30 +0800 Subject: ftrace: Call trace_parser_clear() properly I found a weird behavior: # echo 'fuse:*' > set_ftrace_filter bash: echo: write error: Invalid argument # cat set_ftrace_filter fuse_dev_fasync fuse_dev_poll fuse_copy_do We should call trace_parser_clear() no matter ftrace_process_regex() returns 0 or -errno, otherwise we will actually take the unaccepted records from ftrace_regex_release(). 
Signed-off-by: Li Zefan Acked-by: Steven Rostedt LKML-Reference: <4B1DC4D2.3000406@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 08a3fb5b3187..ff8aecdc6dd6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2208,10 +2208,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, !trace_parser_cont(parser)) { ret = ftrace_process_regex(parser->buffer, parser->idx, enable); + trace_parser_clear(parser); if (ret) goto out_unlock; - - trace_parser_clear(parser); } ret = read; -- cgit v1.2.1 From 91baf6285be7282cfa487de92f836c50749dffb9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:15:45 +0800 Subject: function-graph: Allow writing the same val to set_graph_function # echo 'do_open' > set_graph_function # echo 'do_open' >> set_graph_function bash: echo: write error: Invalid argument Make it valid to write the same value to set_graph_function, which is consistent with set_ftrace_filter interface. Signed-off-by: Li Zefan Acked-by: Steven Rostedt LKML-reference: <4B1DC4E1.1060303@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ff8aecdc6dd6..7968762c8167 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2552,10 +2552,9 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) exists = true; break; } - if (!exists) { + if (!exists) array[(*idx)++] = rec->ip; - found = 1; - } + found = 1; } } while_for_each_ftrace_rec(); -- cgit v1.2.1 From fdb372ed4cadbfe9dbba0e932a77d0523682e690 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:15:59 +0800 Subject: tracing: Use seq file for trace_options Code simplification for reading trace_options. 
Signed-off-by: Li Zefan Acked-by: Steven Rostedt LKML-reference: <4B1DC4EF.3090106@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 60 +++++++++++++++------------------------------------- 1 file changed, 17 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 88bd9ae2a9ed..a6c41cc63285 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2316,67 +2316,32 @@ static const struct file_operations tracing_cpumask_fops = { .write = tracing_cpumask_write, }; -static ssize_t -tracing_trace_options_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; u32 tracer_flags; - int len = 0; - char *buf; - int r = 0; int i; - - /* calculate max size */ - for (i = 0; trace_options[i]; i++) { - len += strlen(trace_options[i]); - len += 3; /* "no" and newline */ - } - mutex_lock(&trace_types_lock); tracer_flags = current_trace->flags->val; trace_opts = current_trace->flags->opts; - /* - * Increase the size with names of options specific - * of the current tracer. 
- */ - for (i = 0; trace_opts[i].name; i++) { - len += strlen(trace_opts[i].name); - len += 3; /* "no" and newline */ - } - - /* +1 for \0 */ - buf = kmalloc(len + 1, GFP_KERNEL); - if (!buf) { - mutex_unlock(&trace_types_lock); - return -ENOMEM; - } - for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) - r += sprintf(buf + r, "%s\n", trace_options[i]); + seq_printf(m, "%s\n", trace_options[i]); else - r += sprintf(buf + r, "no%s\n", trace_options[i]); + seq_printf(m, "no%s\n", trace_options[i]); } for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) - r += sprintf(buf + r, "%s\n", - trace_opts[i].name); + seq_printf(m, "%s\n", trace_opts[i].name); else - r += sprintf(buf + r, "no%s\n", - trace_opts[i].name); + seq_printf(m, "no%s\n", trace_opts[i].name); } mutex_unlock(&trace_types_lock); - WARN_ON(r >= len + 1); - - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - kfree(buf); - return r; + return 0; } /* Try to assign a tracer specific option */ @@ -2471,9 +2436,18 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return cnt; } +static int tracing_trace_options_open(struct inode *inode, struct file *file) +{ + if (tracing_disabled) + return -ENODEV; + return single_open(file, tracing_trace_options_show, NULL); +} + static const struct file_operations tracing_iter_fops = { - .open = tracing_open_generic, - .read = tracing_trace_options_read, + .open = tracing_trace_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, .write = tracing_trace_options_write, }; -- cgit v1.2.1 From 13f16d209161c95e92aef40e350cc6cf56ac440b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:16:11 +0800 Subject: tracing: Use seq file for trace_clock The buffer for the output is as small as 64 bytes, so it'll overflow if we add more clock type. Use seq file instead. 
Signed-off-by: Li Zefan Acked-by: Steven Rostedt LKML-Reference: <4B1DC4FB.5030407@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a6c41cc63285..886268e0d8ee 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3366,21 +3366,18 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return cnt; } -static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_clock_show(struct seq_file *m, void *v) { - char buf[64]; - int bufiter = 0; int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) - bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, + seq_printf(m, "%s%s%s%s", i ? " " : "", i == trace_clock_id ? "[" : "", trace_clocks[i].name, i == trace_clock_id ? "]" : ""); - bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); + seq_putc(m, '\n'); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); + return 0; } static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, @@ -3422,6 +3419,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, return cnt; } +static int tracing_clock_open(struct inode *inode, struct file *file) +{ + if (tracing_disabled) + return -ENODEV; + return single_open(file, tracing_clock_show, NULL); +} + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -3460,8 +3464,10 @@ static const struct file_operations tracing_mark_fops = { }; static const struct file_operations trace_clock_fops = { - .open = tracing_open_generic, - .read = tracing_clock_read, + .open = tracing_clock_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, .write = tracing_clock_write, }; -- cgit v1.2.1 From 
2cbafd68b826f8e0471875cf33cdfb8a1478aef1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:16:26 +0800 Subject: tracing: Remove useless trace option Since commit 4d9493c90f8e6e1b164aede3814010a290161abb ("ftrace: remove add-hoc code"), option "sched-tree" has become useless. Signed-off-by: Li Zefan Acked-by: Steven Rostedt LKML-Reference: <4B1DC50A.7040402@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 1 - kernel/trace/trace.h | 23 +++++++++++------------ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 886268e0d8ee..898409d60422 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -313,7 +313,6 @@ static const char *trace_options[] = { "bin", "block", "stacktrace", - "sched-tree", "trace_printk", "ftrace_preempt", "branch", diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7fa33cab6962..1b18cb240c16 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -597,18 +597,17 @@ enum trace_iterator_flags { TRACE_ITER_BIN = 0x40, TRACE_ITER_BLOCK = 0x80, TRACE_ITER_STACKTRACE = 0x100, - TRACE_ITER_SCHED_TREE = 0x200, - TRACE_ITER_PRINTK = 0x400, - TRACE_ITER_PREEMPTONLY = 0x800, - TRACE_ITER_BRANCH = 0x1000, - TRACE_ITER_ANNOTATE = 0x2000, - TRACE_ITER_USERSTACKTRACE = 0x4000, - TRACE_ITER_SYM_USEROBJ = 0x8000, - TRACE_ITER_PRINTK_MSGONLY = 0x10000, - TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ - TRACE_ITER_LATENCY_FMT = 0x40000, - TRACE_ITER_SLEEP_TIME = 0x80000, - TRACE_ITER_GRAPH_TIME = 0x100000, + TRACE_ITER_PRINTK = 0x200, + TRACE_ITER_PREEMPTONLY = 0x400, + TRACE_ITER_BRANCH = 0x800, + TRACE_ITER_ANNOTATE = 0x1000, + TRACE_ITER_USERSTACKTRACE = 0x2000, + TRACE_ITER_SYM_USEROBJ = 0x4000, + TRACE_ITER_PRINTK_MSGONLY = 0x8000, + TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */ + TRACE_ITER_LATENCY_FMT = 0x20000, + TRACE_ITER_SLEEP_TIME = 0x40000, + TRACE_ITER_GRAPH_TIME = 
0x80000, }; /* -- cgit v1.2.1 From 8d18eaaff5acaa58369be342c86e607643ce10c7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:17:06 +0800 Subject: tracing: Simplify trace_option_write() - remove duplicate code inside trace_options_write() - extract duplicate code in trace_options_write() and set_tracer_option() Signed-off-by: Li Zefan Acked-by: Steven Rostedt LKML-Reference: <4B1DC532.9010802@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 85 +++++++++++++++++++--------------------------------- 1 file changed, 31 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 898409d60422..05076008f371 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2343,38 +2343,39 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) return 0; } +static int __set_tracer_option(struct tracer *trace, + struct tracer_flags *tracer_flags, + struct tracer_opt *opts, int neg) +{ + int ret; + + ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); + if (ret) + return ret; + + if (neg) + tracer_flags->val &= ~opts->bit; + else + tracer_flags->val |= opts->bit; + return 0; +} + /* Try to assign a tracer specific option */ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) { struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; - int ret = 0, i = 0; - int len; + int i; for (i = 0; tracer_flags->opts[i].name; i++) { opts = &tracer_flags->opts[i]; - len = strlen(opts->name); - if (strncmp(cmp, opts->name, len) == 0) { - ret = trace->set_flag(tracer_flags->val, - opts->bit, !neg); - break; - } + if (strcmp(cmp, opts->name) == 0) + return __set_tracer_option(trace, trace->flags, + opts, neg); } - /* Not found */ - if (!tracer_flags->opts[i].name) - return -EINVAL; - /* Refused to handle */ - if (ret) - return ret; - - if (neg) - tracer_flags->val &= ~opts->bit; - else - tracer_flags->val |= opts->bit; - - return 
0; + return -EINVAL; } static void set_tracer_flags(unsigned int mask, int enabled) @@ -2394,7 +2395,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int ret; int i; @@ -2406,16 +2407,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); - if (strncmp(buf, "no", 2) == 0) { + if (strncmp(cmp, "no", 2) == 0) { neg = 1; cmp += 2; } for (i = 0; trace_options[i]; i++) { - int len = strlen(trace_options[i]); - - if (strncmp(cmp, trace_options[i], len) == 0) { + if (strcmp(cmp, trace_options[i]) == 0) { set_tracer_flags(1 << i, !neg); break; } @@ -3927,39 +3927,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; - ret = 0; - switch (val) { - case 0: - /* do nothing if already cleared */ - if (!(topt->flags->val & topt->opt->bit)) - break; - - mutex_lock(&trace_types_lock); - if (current_trace->set_flag) - ret = current_trace->set_flag(topt->flags->val, - topt->opt->bit, 0); - mutex_unlock(&trace_types_lock); - if (ret) - return ret; - topt->flags->val &= ~topt->opt->bit; - break; - case 1: - /* do nothing if already set */ - if (topt->flags->val & topt->opt->bit) - break; + if (val != 0 && val != 1) + return -EINVAL; + if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - if (current_trace->set_flag) - ret = current_trace->set_flag(topt->flags->val, - topt->opt->bit, 1); + ret = __set_tracer_option(current_trace, topt->flags, + topt->opt, val); mutex_unlock(&trace_types_lock); if (ret) return ret; - topt->flags->val |= topt->opt->bit; - break; - - default: - return -EINVAL; } *ppos += cnt; -- cgit v1.2.1 From e00bf2ec60605eb95687b7a0c3b83c87c48541dc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:17:29 +0800 Subject: tracing: Change event->profile_count to be int type Like 
total_profile_count, struct ftrace_event_call::profile_count is protected by event_mutex, so it doesn't need to be atomic_t. Signed-off-by: Li Zefan Acked-by: Steven Rostedt Cc: Jason Baron Cc: Masami Hiramatsu Cc: Peter Zijlstra LKML-Reference: <4B1DC549.5010705@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_event_profile.c | 6 +++--- kernel/trace/trace_kprobe.c | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index d9c60f80aa0d..9e25573242cf 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -25,7 +25,7 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) char *buf; int ret = -ENOMEM; - if (atomic_inc_return(&event->profile_count)) + if (event->profile_count++ > 0) return 0; if (!total_profile_count) { @@ -56,7 +56,7 @@ fail_buf_nmi: perf_trace_buf = NULL; } fail_buf: - atomic_dec(&event->profile_count); + event->profile_count--; return ret; } @@ -83,7 +83,7 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event) { char *buf, *nmi_buf; - if (!atomic_add_negative(-1, &event->profile_count)) + if (--event->profile_count > 0) return; event->profile_disable(event); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index e3c80e925896..6ed223447a3f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1426,7 +1426,6 @@ static int register_probe_event(struct trace_probe *tp) call->unregfunc = probe_event_disable; #ifdef CONFIG_EVENT_PROFILE - atomic_set(&call->profile_count, -1); call->profile_enable = probe_profile_enable; call->profile_disable = probe_profile_disable; #endif -- cgit v1.2.1 From 472bbe02c92a7a8299d7b16946277d98bb8f4bb7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:17:51 +0800 Subject: tracing/power: Remove two exports trace_power_start and trace_power_end are used in 
arch/x86/kernel/power.c, and this file can't be compiled as a module, so these two tracepoints don't need to be exported. Signed-off-by: Li Zefan Acked-by: Arjan van de Ven Acked-by: Steven Rostedt LKML-Reference: <4B1DC55F.7060305@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/power-traces.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index e06c6e3d56a3..9f4f565b01e6 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -14,7 +14,5 @@ #define CREATE_TRACE_POINTS #include -EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(power_end); EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); -- cgit v1.2.1 From 16620e0f1990fa6d896a639449c4b3d678458464 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:18:16 +0800 Subject: ksym_tracer: Fix bad cast Fix this warning: kernel/trace/trace_ksym.c: In function 'ksym_trace_filter_read': kernel/trace/trace_ksym.c:239: warning: cast to pointer from integer of different size Signed-off-by: Li Zefan Acked-by: Steven Rostedt Cc: "K.Prasad" LKML-Reference: <4B1DC578.9020909@cn.fujitsu.com> [remove the strstrip fix as tglx already fixed that] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_ksym.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 64e7a5bd6692..48f1c6c248c6 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -235,7 +235,8 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, mutex_lock(&ksym_tracer_mutex); hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr); + ret = trace_seq_printf(s, "%pS:", + (void *)(unsigned long)entry->attr.bp_addr); if (entry->attr.bp_type == HW_BREAKPOINT_R) ret = trace_seq_puts(s, "r--\n"); else if 
(entry->attr.bp_type == HW_BREAKPOINT_W) @@ -298,8 +299,8 @@ static ssize_t ksym_trace_filter_write(struct file *file, * 2: echo 0 > ksym_trace_filter * 3: echo "*:---" > ksym_trace_filter */ - if (!input_string[0] || !strcmp(input_string, "0") || - !strcmp(input_string, "*:---")) { + if (!buf[0] || !strcmp(buf, "0") || + !strcmp(buf, "*:---")) { __ksym_trace_reset(); ret = 0; goto out; -- cgit v1.2.1 From 5fe85be081edf0ac92d83f9c39e0ab5c1371eb82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:14:58 +0000 Subject: sched: Use rcu in sys_sched_getscheduler/sys_sched_getparam() read_lock(&tasklist_lock) does not protect sys_sched_getscheduler and sys_sched_getparam() against a concurrent update of the policy or scheduler parameters as do_sched_setscheduler() does not take the tasklist_lock. The accessed integers can be retrieved w/o locking and are snapshots anyway. Using rcu_read_lock() to protect find_task_by_vpid() and prevent the task struct from going away is not changing the above situation. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.753790977@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 258c73c6a2f3..1782beed2fa7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6458,7 +6458,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (p) { retval = security_task_getscheduler(p); @@ -6466,7 +6466,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) retval = p->policy | (p->sched_reset_on_fork ? 
SCHED_RESET_ON_FORK : 0); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -6484,7 +6484,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (!param || pid < 0) return -EINVAL; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) @@ -6495,7 +6495,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) goto out_unlock; lp.sched_priority = p->rt_priority; - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * This one might sleep, we cannot do it with a spinlock held ... @@ -6505,7 +6505,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } -- cgit v1.2.1 From 23f5d142519621b16cf2b378cf8adf4dcf01a616 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:15:01 +0000 Subject: sched: Use rcu in sched_get/set_affinity() tasklist_lock is held read locked to protect the find_task_by_vpid() call and to prevent the task going away. sched_setaffinity acquires a task struct ref and drops tasklist lock right away. The access to the cpus_allowed mask is protected by rq->lock. rcu_read_lock() provides the same protection here. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.789059966@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1782beed2fa7..79893123325c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6516,22 +6516,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return -ESRCH; } - /* - * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's - * usage count and then drop tasklist_lock. - */ + /* Prevent p going away */ get_task_struct(p); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; @@ -6617,7 +6613,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); @@ -6633,7 +6629,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) task_rq_unlock(rq, &flags); out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return retval; -- cgit v1.2.1 From 1a551ae715825bb2a2107a2dd68de024a1fa4e32 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:15:11 +0000 Subject: sched: Use rcu in sched_get_rr_param() read_lock(&tasklist_lock) does not protect sys_sched_get_rr_param() against a concurrent update of the policy or scheduler parameters as do_sched_setscheduler() does not take the tasklist_lock. The access to task->sched_class->get_rr_interval is protected by task_rq_lock(task). Use rcu_read_lock() to protect find_task_by_vpid() and prevent the task struct from going away. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.862897167@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 79893123325c..db5c26692dd5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6873,7 +6873,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) goto out_unlock; @@ -6886,13 +6886,13 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, time_slice = p->sched_class->get_rr_interval(rq, p); task_rq_unlock(rq, &flags); - read_unlock(&tasklist_lock); + rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } -- cgit v1.2.1 From b7b40ade58e621851896aa261452df99d4e9d99b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Nov 2009 21:01:59 +0100 Subject: locking: Reorder functions in spinlock.c Separate spin_lock and rw_lock functions. Preempt-RT needs to exclude the rw_lock functions from being compiled. The reordering makes it possible to do that with a single #ifdef. No functional change. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/spinlock.c | 246 +++++++++++++++++++++++++++--------------------------- 1 file changed, 123 insertions(+), 123 deletions(-) (limited to 'kernel') diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 41e042219ff6..e6e136318437 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -113,41 +113,6 @@ BUILD_LOCK_OPS(write, rwlock); #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) -{ - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -} -EXPORT_SYMBOL(_spin_lock_nested); - -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, - int subclass) -{ - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, - _raw_spin_lock_flags, &flags); - return flags; -} -EXPORT_SYMBOL(_spin_lock_irqsave_nested); - -void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, - struct lockdep_map *nest_lock) -{ - preempt_disable(); - spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -} -EXPORT_SYMBOL(_spin_lock_nest_lock); - -#endif - #ifndef CONFIG_INLINE_SPIN_TRYLOCK int __lockfunc _spin_trylock(spinlock_t *lock) { @@ -156,28 +121,20 @@ int __lockfunc _spin_trylock(spinlock_t *lock) EXPORT_SYMBOL(_spin_trylock); #endif -#ifndef CONFIG_INLINE_READ_TRYLOCK -int __lockfunc _read_trylock(rwlock_t *lock) -{ - return __read_trylock(lock); -} -EXPORT_SYMBOL(_read_trylock); -#endif - -#ifndef CONFIG_INLINE_WRITE_TRYLOCK -int __lockfunc _write_trylock(rwlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH +int __lockfunc _spin_trylock_bh(spinlock_t *lock) { - return __write_trylock(lock); + return __spin_trylock_bh(lock); } 
-EXPORT_SYMBOL(_write_trylock); +EXPORT_SYMBOL(_spin_trylock_bh); #endif -#ifndef CONFIG_INLINE_READ_LOCK -void __lockfunc _read_lock(rwlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_LOCK +void __lockfunc _spin_lock(spinlock_t *lock) { - __read_lock(lock); + __spin_lock(lock); } -EXPORT_SYMBOL(_read_lock); +EXPORT_SYMBOL(_spin_lock); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE @@ -204,84 +161,76 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock) EXPORT_SYMBOL(_spin_lock_bh); #endif -#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) -{ - return __read_lock_irqsave(lock); -} -EXPORT_SYMBOL(_read_lock_irqsave); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK_IRQ -void __lockfunc _read_lock_irq(rwlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_UNLOCK +void __lockfunc _spin_unlock(spinlock_t *lock) { - __read_lock_irq(lock); + __spin_unlock(lock); } -EXPORT_SYMBOL(_read_lock_irq); +EXPORT_SYMBOL(_spin_unlock); #endif -#ifndef CONFIG_INLINE_READ_LOCK_BH -void __lockfunc _read_lock_bh(rwlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE +void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { - __read_lock_bh(lock); + __spin_unlock_irqrestore(lock, flags); } -EXPORT_SYMBOL(_read_lock_bh); +EXPORT_SYMBOL(_spin_unlock_irqrestore); #endif -#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ +void __lockfunc _spin_unlock_irq(spinlock_t *lock) { - return __write_lock_irqsave(lock); + __spin_unlock_irq(lock); } -EXPORT_SYMBOL(_write_lock_irqsave); +EXPORT_SYMBOL(_spin_unlock_irq); #endif -#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ -void __lockfunc _write_lock_irq(rwlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH +void __lockfunc _spin_unlock_bh(spinlock_t *lock) { - __write_lock_irq(lock); + __spin_unlock_bh(lock); } -EXPORT_SYMBOL(_write_lock_irq); +EXPORT_SYMBOL(_spin_unlock_bh); #endif -#ifndef 
CONFIG_INLINE_WRITE_LOCK_BH -void __lockfunc _write_lock_bh(rwlock_t *lock) +#ifndef CONFIG_INLINE_READ_TRYLOCK +int __lockfunc _read_trylock(rwlock_t *lock) { - __write_lock_bh(lock); + return __read_trylock(lock); } -EXPORT_SYMBOL(_write_lock_bh); +EXPORT_SYMBOL(_read_trylock); #endif -#ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _spin_lock(spinlock_t *lock) +#ifndef CONFIG_INLINE_READ_LOCK +void __lockfunc _read_lock(rwlock_t *lock) { - __spin_lock(lock); + __read_lock(lock); } -EXPORT_SYMBOL(_spin_lock); +EXPORT_SYMBOL(_read_lock); #endif -#ifndef CONFIG_INLINE_WRITE_LOCK -void __lockfunc _write_lock(rwlock_t *lock) +#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE +unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) { - __write_lock(lock); + return __read_lock_irqsave(lock); } -EXPORT_SYMBOL(_write_lock); +EXPORT_SYMBOL(_read_lock_irqsave); #endif -#ifndef CONFIG_INLINE_SPIN_UNLOCK -void __lockfunc _spin_unlock(spinlock_t *lock) +#ifndef CONFIG_INLINE_READ_LOCK_IRQ +void __lockfunc _read_lock_irq(rwlock_t *lock) { - __spin_unlock(lock); + __read_lock_irq(lock); } -EXPORT_SYMBOL(_spin_unlock); +EXPORT_SYMBOL(_read_lock_irq); #endif -#ifndef CONFIG_INLINE_WRITE_UNLOCK -void __lockfunc _write_unlock(rwlock_t *lock) +#ifndef CONFIG_INLINE_READ_LOCK_BH +void __lockfunc _read_lock_bh(rwlock_t *lock) { - __write_unlock(lock); + __read_lock_bh(lock); } -EXPORT_SYMBOL(_write_unlock); +EXPORT_SYMBOL(_read_lock_bh); #endif #ifndef CONFIG_INLINE_READ_UNLOCK @@ -292,30 +241,6 @@ void __lockfunc _read_unlock(rwlock_t *lock) EXPORT_SYMBOL(_read_unlock); #endif -#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) -{ - __spin_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(_spin_unlock_irqrestore); -#endif - -#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _spin_unlock_irq(spinlock_t *lock) -{ - __spin_unlock_irq(lock); -} -EXPORT_SYMBOL(_spin_unlock_irq); -#endif - -#ifndef 
CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _spin_unlock_bh(spinlock_t *lock) -{ - __spin_unlock_bh(lock); -} -EXPORT_SYMBOL(_spin_unlock_bh); -#endif - #ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { @@ -340,6 +265,54 @@ void __lockfunc _read_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_read_unlock_bh); #endif +#ifndef CONFIG_INLINE_WRITE_TRYLOCK +int __lockfunc _write_trylock(rwlock_t *lock) +{ + return __write_trylock(lock); +} +EXPORT_SYMBOL(_write_trylock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK +void __lockfunc _write_lock(rwlock_t *lock) +{ + __write_lock(lock); +} +EXPORT_SYMBOL(_write_lock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE +unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) +{ + return __write_lock_irqsave(lock); +} +EXPORT_SYMBOL(_write_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ +void __lockfunc _write_lock_irq(rwlock_t *lock) +{ + __write_lock_irq(lock); +} +EXPORT_SYMBOL(_write_lock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_BH +void __lockfunc _write_lock_bh(rwlock_t *lock) +{ + __write_lock_bh(lock); +} +EXPORT_SYMBOL(_write_lock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK +void __lockfunc _write_unlock(rwlock_t *lock) +{ + __write_unlock(lock); +} +EXPORT_SYMBOL(_write_unlock); +#endif + #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { @@ -364,12 +337,39 @@ void __lockfunc _write_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_write_unlock_bh); #endif -#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _spin_trylock_bh(spinlock_t *lock) +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) { - return __spin_trylock_bh(lock); + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } -EXPORT_SYMBOL(_spin_trylock_bh); 
+EXPORT_SYMBOL(_spin_lock_nested); + +unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, + int subclass) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, + _raw_spin_lock_flags, &flags); + return flags; +} +EXPORT_SYMBOL(_spin_lock_irqsave_nested); + +void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + preempt_disable(); + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); +} +EXPORT_SYMBOL(_spin_lock_nest_lock); + #endif notrace int in_lock_functions(unsigned long addr) -- cgit v1.2.1 From 445c89514be242b1b0080056d50bdc1b72adeb5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Dec 2009 19:49:50 +0100 Subject: locking: Convert raw_spinlock to arch_spinlock The raw_spin* namespace was taken by lockdep for the architecture specific implementations. raw_spin_* would be the ideal name space for the spinlocks which are not converted to sleeping locks in preempt-rt. Linus suggested converting the raw_ locks to arch_ locks and cleaning up the name space instead of using an artificial name like core_spin, atomic_spin or whatever. No functional change. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: David S. 
Miller Acked-by: Ingo Molnar Cc: linux-arch@vger.kernel.org --- kernel/lockdep.c | 2 +- kernel/trace/ring_buffer.c | 4 ++-- kernel/trace/trace.c | 18 +++++++++--------- kernel/trace/trace_clock.c | 4 ++-- kernel/trace/trace_sched_wakeup.c | 4 ++-- kernel/trace/trace_stack.c | 4 ++-- 6 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 429540c70d3f..7cc50c62af59 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -73,7 +73,7 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... */ -static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static int graph_lock(void) { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a1ca4956ab5e..5ac8ee0a9e35 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -423,7 +423,7 @@ struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; spinlock_t reader_lock; /* serialize readers */ - raw_spinlock_t lock; + arch_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; struct buffer_page *head_page; /* read from head */ @@ -998,7 +998,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->buffer = buffer; spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); - cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + cpu_buffer->lock = (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c82dfd92fdfd..7d56cecc2c6e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -493,15 +493,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, 
size_t cnt) * protected by per_cpu spinlocks. But the action of the swap * needs its own lock. * - * This is defined as a raw_spinlock_t in order to help + * This is defined as a arch_spinlock_t in order to help * with performance when lockdep debugging is enabled. * * It is also used in other places outside the update_max_tr * so it needs to be defined outside of the * CONFIG_TRACER_MAX_TRACE. */ -static raw_spinlock_t ftrace_max_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t ftrace_max_lock = + (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE unsigned long __read_mostly tracing_max_latency; @@ -802,7 +802,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; -static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; @@ -1251,8 +1251,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - static raw_spinlock_t trace_buf_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t trace_buf_lock = + (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; struct ftrace_event_call *call = &event_bprint; @@ -1334,7 +1334,7 @@ int trace_array_printk(struct trace_array *tr, int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { - static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; struct ftrace_event_call *call = &event_print; @@ -4307,8 +4307,8 @@ trace_printk_seq(struct trace_seq *s) static void 
__ftrace_dump(bool disable_tracing) { - static raw_spinlock_t ftrace_dump_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t ftrace_dump_lock = + (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 878c03f386ba..206ec3d4b3c2 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -71,10 +71,10 @@ u64 notrace trace_clock(void) /* keep prev_time and lock in the same cacheline. */ static struct { u64 prev_time; - raw_spinlock_t lock; + arch_spinlock_t lock; } trace_clock_struct ____cacheline_aligned_in_smp = { - .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, + .lock = (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, }; u64 notrace trace_clock_global(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 26185d727676..4cf7e83ec235 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -28,8 +28,8 @@ static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; -static raw_spinlock_t wakeup_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t wakeup_lock = + (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static void __wakeup_reset(struct trace_array *tr); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8504ac71e4e8..9a82d568fdec 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = { }; static unsigned long max_stack_size; -static raw_spinlock_t max_stack_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t max_stack_lock = + (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); -- cgit v1.2.1 From 
edc35bd72e2079b25f99c5da7d7a65dbbffc4a26 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Dec 2009 12:38:57 +0100 Subject: locking: Rename __RAW_SPIN_LOCK_UNLOCKED to __ARCH_SPIN_LOCK_UNLOCKED Further name space cleanup. No functional change Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: David S. Miller Acked-by: Ingo Molnar Cc: linux-arch@vger.kernel.org --- kernel/lockdep.c | 2 +- kernel/trace/ring_buffer.c | 2 +- kernel/trace/trace.c | 10 +++++----- kernel/trace/trace_clock.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_stack.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7cc50c62af59..2389e3f85cf6 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -73,7 +73,7 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... */ -static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static int graph_lock(void) { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5ac8ee0a9e35..fb7a0fa508b9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -998,7 +998,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->buffer = buffer; spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); - cpu_buffer->lock = (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7d56cecc2c6e..63bc1cc38219 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -501,7 +501,7 @@ static ssize_t trace_seq_to_buffer(struct 
trace_seq *s, void *buf, size_t cnt) * CONFIG_TRACER_MAX_TRACE. */ static arch_spinlock_t ftrace_max_lock = - (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE unsigned long __read_mostly tracing_max_latency; @@ -802,7 +802,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; -static arch_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; @@ -1252,7 +1252,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { static arch_spinlock_t trace_buf_lock = - (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; struct ftrace_event_call *call = &event_bprint; @@ -1334,7 +1334,7 @@ int trace_array_printk(struct trace_array *tr, int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { - static arch_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; struct ftrace_event_call *call = &event_print; @@ -4308,7 +4308,7 @@ trace_printk_seq(struct trace_seq *s) static void __ftrace_dump(bool disable_tracing) { static arch_spinlock_t ftrace_dump_lock = - (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 206ec3d4b3c2..433e2eda2d01 100644 --- 
a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -74,7 +74,7 @@ static struct { arch_spinlock_t lock; } trace_clock_struct ____cacheline_aligned_in_smp = { - .lock = (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, + .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, }; u64 notrace trace_clock_global(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4cf7e83ec235..e347853564e9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -29,7 +29,7 @@ static unsigned wakeup_prio = -1; static int wakeup_rt; static arch_spinlock_t wakeup_lock = - (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static void __wakeup_reset(struct trace_array *tr); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 9a82d568fdec..728c35221483 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -28,7 +28,7 @@ static struct stack_trace max_stack_trace = { static unsigned long max_stack_size; static arch_spinlock_t max_stack_lock = - (arch_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); -- cgit v1.2.1 From 0199c4e68d1f02894bdefe4b5d9e9ee4aedd8d62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Dec 2009 20:01:25 +0100 Subject: locking: Convert __raw_spin* functions to arch_spin* Name space cleanup. No functional change. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: David S. 
Miller Acked-by: Ingo Molnar Cc: linux-arch@vger.kernel.org --- kernel/lockdep.c | 18 +++++++++--------- kernel/mutex-debug.h | 4 ++-- kernel/spinlock.c | 4 ++-- kernel/trace/ring_buffer.c | 12 ++++++------ kernel/trace/trace.c | 32 ++++++++++++++++---------------- kernel/trace/trace_clock.c | 4 ++-- kernel/trace/trace_sched_wakeup.c | 12 ++++++------ kernel/trace/trace_selftest.c | 4 ++-- kernel/trace/trace_stack.c | 12 ++++++------ 9 files changed, 51 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2389e3f85cf6..5feaddcdbe49 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -77,7 +77,7 @@ static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED static int graph_lock(void) { - __raw_spin_lock(&lockdep_lock); + arch_spin_lock(&lockdep_lock); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -85,7 +85,7 @@ static int graph_lock(void) * dropped already) */ if (!debug_locks) { - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); return 0; } /* prevent any recursions within lockdep from causing deadlocks */ @@ -95,11 +95,11 @@ static int graph_lock(void) static inline int graph_unlock(void) { - if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) + if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) return DEBUG_LOCKS_WARN_ON(1); current->lockdep_recursion--; - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); return 0; } @@ -111,7 +111,7 @@ static inline int debug_locks_off_graph_unlock(void) { int ret = debug_locks_off(); - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); return ret; } @@ -1170,9 +1170,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) this.class = class; local_irq_save(flags); - __raw_spin_lock(&lockdep_lock); + arch_spin_lock(&lockdep_lock); ret = __lockdep_count_forward_deps(&this); - 
__raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); local_irq_restore(flags); return ret; @@ -1197,9 +1197,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) this.class = class; local_irq_save(flags); - __raw_spin_lock(&lockdep_lock); + arch_spin_lock(&lockdep_lock); ret = __lockdep_count_backward_deps(&this); - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); local_irq_restore(flags); return ret; diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 6b2d735846a5..7bebbd15b342 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock) \ DEBUG_LOCKS_WARN_ON(in_interrupt()); \ local_irq_save(flags); \ - __raw_spin_lock(&(lock)->raw_lock); \ + arch_spin_lock(&(lock)->raw_lock); \ DEBUG_LOCKS_WARN_ON(l->magic != l); \ } while (0) #define spin_unlock_mutex(lock, flags) \ do { \ - __raw_spin_unlock(&(lock)->raw_lock); \ + arch_spin_unlock(&(lock)->raw_lock); \ local_irq_restore(flags); \ preempt_check_resched(); \ } while (0) diff --git a/kernel/spinlock.c b/kernel/spinlock.c index e6e136318437..fbb5f8b78357 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -53,7 +53,7 @@ void __lockfunc __##op##_lock(locktype##_t *lock) \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ @@ -73,7 +73,7 @@ unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fb7a0fa508b9..f58c9ad15830 100644 --- a/kernel/trace/ring_buffer.c +++ 
b/kernel/trace/ring_buffer.c @@ -2834,7 +2834,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) int ret; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + arch_spin_lock(&cpu_buffer->lock); again: /* @@ -2923,7 +2923,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return reader; @@ -3286,9 +3286,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) synchronize_sched(); spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - __raw_spin_lock(&cpu_buffer->lock); + arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return iter; @@ -3408,11 +3408,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; - __raw_spin_lock(&cpu_buffer->lock); + arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 63bc1cc38219..bb6b5e7fa2a2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -555,13 +555,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; max_tr.buffer = buf; __update_max_tr(tr, tsk, cpu); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); } /** @@ -581,7 +581,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); 
ftrace_disable_cpu(); @@ -603,7 +603,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); } #endif /* CONFIG_TRACER_MAX_TRACE */ @@ -915,7 +915,7 @@ static void trace_save_cmdline(struct task_struct *tsk) * nor do we want to disable interrupts, * so if we miss here, then better luck next time. */ - if (!__raw_spin_trylock(&trace_cmdline_lock)) + if (!arch_spin_trylock(&trace_cmdline_lock)) return; idx = map_pid_to_cmdline[tsk->pid]; @@ -940,7 +940,7 @@ static void trace_save_cmdline(struct task_struct *tsk) memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); - __raw_spin_unlock(&trace_cmdline_lock); + arch_spin_unlock(&trace_cmdline_lock); } void trace_find_cmdline(int pid, char comm[]) @@ -958,14 +958,14 @@ void trace_find_cmdline(int pid, char comm[]) } preempt_disable(); - __raw_spin_lock(&trace_cmdline_lock); + arch_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) strcpy(comm, saved_cmdlines[map]); else strcpy(comm, "<...>"); - __raw_spin_unlock(&trace_cmdline_lock); + arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); } @@ -1283,7 +1283,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) /* Lockdep uses trace_printk for lock tracing */ local_irq_save(flags); - __raw_spin_lock(&trace_buf_lock); + arch_spin_lock(&trace_buf_lock); len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); if (len > TRACE_BUF_SIZE || len < 0) @@ -1304,7 +1304,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) ring_buffer_unlock_commit(buffer, event); out_unlock: - __raw_spin_unlock(&trace_buf_lock); + arch_spin_unlock(&trace_buf_lock); local_irq_restore(flags); out: @@ -1360,7 +1360,7 @@ int trace_array_vprintk(struct trace_array *tr, pause_graph_tracing(); raw_local_irq_save(irq_flags); - 
__raw_spin_lock(&trace_buf_lock); + arch_spin_lock(&trace_buf_lock); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; @@ -1378,7 +1378,7 @@ int trace_array_vprintk(struct trace_array *tr, ring_buffer_unlock_commit(buffer, event); out_unlock: - __raw_spin_unlock(&trace_buf_lock); + arch_spin_unlock(&trace_buf_lock); raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: @@ -2279,7 +2279,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); local_irq_disable(); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are @@ -2294,7 +2294,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, atomic_dec(&global_trace.data[cpu]->disabled); } } - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); local_irq_enable(); cpumask_copy(tracing_cpumask, tracing_cpumask_new); @@ -4318,7 +4318,7 @@ static void __ftrace_dump(bool disable_tracing) /* only one dump */ local_irq_save(flags); - __raw_spin_lock(&ftrace_dump_lock); + arch_spin_lock(&ftrace_dump_lock); if (dump_ran) goto out; @@ -4393,7 +4393,7 @@ static void __ftrace_dump(bool disable_tracing) } out: - __raw_spin_unlock(&ftrace_dump_lock); + arch_spin_unlock(&ftrace_dump_lock); local_irq_restore(flags); } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 433e2eda2d01..84a3a7ba072a 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -94,7 +94,7 @@ u64 notrace trace_clock_global(void) if (unlikely(in_nmi())) goto out; - __raw_spin_lock(&trace_clock_struct.lock); + arch_spin_lock(&trace_clock_struct.lock); /* * TODO: if this happens often then maybe we should reset @@ -106,7 +106,7 @@ u64 notrace trace_clock_global(void) trace_clock_struct.prev_time = now; - __raw_spin_unlock(&trace_clock_struct.lock); + 
arch_spin_unlock(&trace_clock_struct.lock); out: raw_local_irq_restore(flags); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e347853564e9..0271742abb8d 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -143,7 +143,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, goto out; local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); /* We could race with grabbing wakeup_lock */ if (unlikely(!tracer_enabled || next != wakeup_task)) @@ -169,7 +169,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, out_unlock: __wakeup_reset(wakeup_trace); - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: atomic_dec(&wakeup_trace->data[cpu]->disabled); @@ -193,9 +193,9 @@ static void wakeup_reset(struct trace_array *tr) tracing_reset_online_cpus(tr); local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); __wakeup_reset(tr); - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); } @@ -225,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) goto out; /* interrupts should be off from try_to_wake_up */ - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); /* check for races. 
*/ if (!tracer_enabled || p->prio >= wakeup_prio) @@ -255,7 +255,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); out: atomic_dec(&wakeup_trace->data[cpu]->disabled); } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index dc98309e839a..280fea470d67 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) /* Don't allow flipping of max traces now */ local_irq_save(flags); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); cnt = ring_buffer_entries(tr->buffer); @@ -85,7 +85,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) break; } tracing_on(); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); local_irq_restore(flags); if (count) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 728c35221483..678a5120ee30 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -54,7 +54,7 @@ static inline void check_stack(void) return; local_irq_save(flags); - __raw_spin_lock(&max_stack_lock); + arch_spin_lock(&max_stack_lock); /* a race could have already updated it */ if (this_size <= max_stack_size) @@ -103,7 +103,7 @@ static inline void check_stack(void) } out: - __raw_spin_unlock(&max_stack_lock); + arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); } @@ -171,9 +171,9 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); - __raw_spin_lock(&max_stack_lock); + arch_spin_lock(&max_stack_lock); *ptr = val; - __raw_spin_unlock(&max_stack_lock); + arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); return count; @@ -207,7 +207,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static 
void *t_start(struct seq_file *m, loff_t *pos) { local_irq_disable(); - __raw_spin_lock(&max_stack_lock); + arch_spin_lock(&max_stack_lock); if (*pos == 0) return SEQ_START_TOKEN; @@ -217,7 +217,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { - __raw_spin_unlock(&max_stack_lock); + arch_spin_unlock(&max_stack_lock); local_irq_enable(); } -- cgit v1.2.1 From c2f21ce2e31286a0a32f8da0a7856e9ca1122ef3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Dec 2009 20:02:59 +0100 Subject: locking: Implement new raw_spinlock Now that the raw_spin name space is freed up, we can implement raw_spinlock and the related functions which are used to annotate the locks which are not converted to sleeping spinlocks in preempt-rt. A side effect is that only such locks can be used with the low level lock functions which circumvent lockdep. For !rt spin_* functions are mapped to the raw_spin* implementations. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/mutex-debug.h | 12 ++++++------ kernel/sched.c | 2 +- kernel/spinlock.c | 34 ++++++++++++++++++---------------- 3 files changed, 25 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 7bebbd15b342..57d527a16f9d 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock) \ DEBUG_LOCKS_WARN_ON(in_interrupt()); \ local_irq_save(flags); \ - arch_spin_lock(&(lock)->raw_lock); \ + arch_spin_lock(&(lock)->rlock.raw_lock);\ DEBUG_LOCKS_WARN_ON(l->magic != l); \ } while (0) -#define spin_unlock_mutex(lock, flags) \ - do { \ - arch_spin_unlock(&(lock)->raw_lock); \ - local_irq_restore(flags); \ - preempt_check_resched(); \ +#define spin_unlock_mutex(lock, flags) \ + do { \ + arch_spin_unlock(&(lock)->rlock.raw_lock); \ + local_irq_restore(flags); \ + preempt_check_resched(); \ } while (0) diff 
--git a/kernel/sched.c b/kernel/sched.c index fd05861b2111..e6acf2d7b753 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -884,7 +884,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; + rq->lock.rlock.owner = current; #endif /* * If we are tracking spinlock dependencies then we have to diff --git a/kernel/spinlock.c b/kernel/spinlock.c index fbb5f8b78357..54eb7dd3c608 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -32,6 +32,8 @@ * include/linux/spinlock_api_smp.h */ #else +#define raw_read_can_lock(l) read_can_lock(l) +#define raw_write_can_lock(l) write_can_lock(l) /* * We build the __lock_function inlines here. They are too large for * inlining all over the place, but here is only one user per function @@ -52,7 +54,7 @@ void __lockfunc __##op##_lock(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ @@ -72,7 +74,7 @@ unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ @@ -107,14 +109,14 @@ void __lockfunc __##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_irqsave() * __[spin|read|write]_lock_bh() */ -BUILD_LOCK_OPS(spin, spinlock); +BUILD_LOCK_OPS(spin, raw_spinlock); BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK -int __lockfunc _spin_trylock(spinlock_t *lock) +int __lockfunc _spin_trylock(raw_spinlock_t *lock) { return __spin_trylock(lock); } @@ -122,7 +124,7 @@ 
EXPORT_SYMBOL(_spin_trylock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _spin_trylock_bh(spinlock_t *lock) +int __lockfunc _spin_trylock_bh(raw_spinlock_t *lock) { return __spin_trylock_bh(lock); } @@ -130,7 +132,7 @@ EXPORT_SYMBOL(_spin_trylock_bh); #endif #ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _spin_lock(spinlock_t *lock) +void __lockfunc _spin_lock(raw_spinlock_t *lock) { __spin_lock(lock); } @@ -138,7 +140,7 @@ EXPORT_SYMBOL(_spin_lock); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +unsigned long __lockfunc _spin_lock_irqsave(raw_spinlock_t *lock) { return __spin_lock_irqsave(lock); } @@ -146,7 +148,7 @@ EXPORT_SYMBOL(_spin_lock_irqsave); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQ -void __lockfunc _spin_lock_irq(spinlock_t *lock) +void __lockfunc _spin_lock_irq(raw_spinlock_t *lock) { __spin_lock_irq(lock); } @@ -154,7 +156,7 @@ EXPORT_SYMBOL(_spin_lock_irq); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_BH -void __lockfunc _spin_lock_bh(spinlock_t *lock) +void __lockfunc _spin_lock_bh(raw_spinlock_t *lock) { __spin_lock_bh(lock); } @@ -162,7 +164,7 @@ EXPORT_SYMBOL(_spin_lock_bh); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK -void __lockfunc _spin_unlock(spinlock_t *lock) +void __lockfunc _spin_unlock(raw_spinlock_t *lock) { __spin_unlock(lock); } @@ -170,7 +172,7 @@ EXPORT_SYMBOL(_spin_unlock); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +void __lockfunc _spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { __spin_unlock_irqrestore(lock, flags); } @@ -178,7 +180,7 @@ EXPORT_SYMBOL(_spin_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _spin_unlock_irq(spinlock_t *lock) +void __lockfunc _spin_unlock_irq(raw_spinlock_t *lock) { __spin_unlock_irq(lock); } @@ -186,7 +188,7 @@ EXPORT_SYMBOL(_spin_unlock_irq); #endif #ifndef 
CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _spin_unlock_bh(spinlock_t *lock) +void __lockfunc _spin_unlock_bh(raw_spinlock_t *lock) { __spin_unlock_bh(lock); } @@ -339,7 +341,7 @@ EXPORT_SYMBOL(_write_unlock_bh); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) +void __lockfunc _spin_lock_nested(raw_spinlock_t *lock, int subclass) { preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); @@ -347,7 +349,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_spin_lock_nested); -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, +unsigned long __lockfunc _spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { unsigned long flags; @@ -361,7 +363,7 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, } EXPORT_SYMBOL(_spin_lock_irqsave_nested); -void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, +void __lockfunc _spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *nest_lock) { preempt_disable(); -- cgit v1.2.1 From 9828ea9d75c38fe3dce05d00566eed61c85732e6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Dec 2009 20:55:53 +0100 Subject: locking: Further name space cleanups The name space hierarchy for the internal lock functions is now a bit backwards. raw_spin* functions map to _spin* which use __spin*, while we would like to have _raw_spin* and __raw_spin*. _raw_spin* is already used by lock debugging, so rename those functions to do_raw_spin* to free up the _raw_spin* name space. No functional change. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/spinlock.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e6acf2d7b753..91c65dd91435 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6684,7 +6684,7 @@ SYSCALL_DEFINE0(sched_yield) */ __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - _raw_spin_unlock(&rq->lock); + do_raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); schedule(); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 54eb7dd3c608..795240b81224 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -48,7 +48,7 @@ void __lockfunc __##op##_lock(locktype##_t *lock) \ { \ for (;;) { \ preempt_disable(); \ - if (likely(_raw_##op##_trylock(lock))) \ + if (likely(do_raw_##op##_trylock(lock))) \ break; \ preempt_enable(); \ \ @@ -67,7 +67,7 @@ unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ for (;;) { \ preempt_disable(); \ local_irq_save(flags); \ - if (likely(_raw_##op##_trylock(lock))) \ + if (likely(do_raw_##op##_trylock(lock))) \ break; \ local_irq_restore(flags); \ preempt_enable(); \ @@ -345,7 +345,7 @@ void __lockfunc _spin_lock_nested(raw_spinlock_t *lock, int subclass) { preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); } EXPORT_SYMBOL(_spin_lock_nested); @@ -357,8 +357,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(raw_spinlock_t *lock, local_irq_save(flags); preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, - _raw_spin_lock_flags, &flags); + LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, + do_raw_spin_lock_flags, &flags); return flags; } EXPORT_SYMBOL(_spin_lock_irqsave_nested); @@ -368,7 
+368,7 @@ void __lockfunc _spin_lock_nest_lock(raw_spinlock_t *lock, { preempt_disable(); spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); } EXPORT_SYMBOL(_spin_lock_nest_lock); -- cgit v1.2.1 From 9c1721aa4994f6625decbd915241f3a94ee2fe67 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Dec 2009 21:52:18 +0100 Subject: locking: Cleanup the name space completely Make the name space hierarchy of locking functions consistent: raw_spin* -> _raw_spin* -> __raw_spin* No functional change. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/spinlock.c | 192 +++++++++++++++++++++++++++--------------------------- 1 file changed, 96 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 795240b81224..be6517fb9c14 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -44,7 +44,7 @@ * towards that other CPU that it should break the lock ASAP. 
*/ #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc __##op##_lock(locktype##_t *lock) \ +void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ { \ for (;;) { \ preempt_disable(); \ @@ -60,7 +60,7 @@ void __lockfunc __##op##_lock(locktype##_t *lock) \ (lock)->break_lock = 0; \ } \ \ -unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ +unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -81,12 +81,12 @@ unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ return flags; \ } \ \ -void __lockfunc __##op##_lock_irq(locktype##_t *lock) \ +void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ { \ - _##op##_lock_irqsave(lock); \ + _raw_##op##_lock_irqsave(lock); \ } \ \ -void __lockfunc __##op##_lock_bh(locktype##_t *lock) \ +void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -95,7 +95,7 @@ void __lockfunc __##op##_lock_bh(locktype##_t *lock) \ /* irq-disabling. 
We use the generic preemption-aware */ \ /* function: */ \ /**/ \ - flags = _##op##_lock_irqsave(lock); \ + flags = _raw_##op##_lock_irqsave(lock); \ local_bh_disable(); \ local_irq_restore(flags); \ } \ @@ -116,240 +116,240 @@ BUILD_LOCK_OPS(write, rwlock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK -int __lockfunc _spin_trylock(raw_spinlock_t *lock) +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { - return __spin_trylock(lock); + return __raw_spin_trylock(lock); } -EXPORT_SYMBOL(_spin_trylock); +EXPORT_SYMBOL(_raw_spin_trylock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _spin_trylock_bh(raw_spinlock_t *lock) +int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { - return __spin_trylock_bh(lock); + return __raw_spin_trylock_bh(lock); } -EXPORT_SYMBOL(_spin_trylock_bh); +EXPORT_SYMBOL(_raw_spin_trylock_bh); #endif #ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _spin_lock(raw_spinlock_t *lock) +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { - __spin_lock(lock); + __raw_spin_lock(lock); } -EXPORT_SYMBOL(_spin_lock); +EXPORT_SYMBOL(_raw_spin_lock); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE -unsigned long __lockfunc _spin_lock_irqsave(raw_spinlock_t *lock) +unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) { - return __spin_lock_irqsave(lock); + return __raw_spin_lock_irqsave(lock); } -EXPORT_SYMBOL(_spin_lock_irqsave); +EXPORT_SYMBOL(_raw_spin_lock_irqsave); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQ -void __lockfunc _spin_lock_irq(raw_spinlock_t *lock) +void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { - __spin_lock_irq(lock); + __raw_spin_lock_irq(lock); } -EXPORT_SYMBOL(_spin_lock_irq); +EXPORT_SYMBOL(_raw_spin_lock_irq); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_BH -void __lockfunc _spin_lock_bh(raw_spinlock_t *lock) +void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { - __spin_lock_bh(lock); + __raw_spin_lock_bh(lock); } -EXPORT_SYMBOL(_spin_lock_bh); 
+EXPORT_SYMBOL(_raw_spin_lock_bh); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK -void __lockfunc _spin_unlock(raw_spinlock_t *lock) +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { - __spin_unlock(lock); + __raw_spin_unlock(lock); } -EXPORT_SYMBOL(_spin_unlock); +EXPORT_SYMBOL(_raw_spin_unlock); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) +void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { - __spin_unlock_irqrestore(lock, flags); + __raw_spin_unlock_irqrestore(lock, flags); } -EXPORT_SYMBOL(_spin_unlock_irqrestore); +EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _spin_unlock_irq(raw_spinlock_t *lock) +void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { - __spin_unlock_irq(lock); + __raw_spin_unlock_irq(lock); } -EXPORT_SYMBOL(_spin_unlock_irq); +EXPORT_SYMBOL(_raw_spin_unlock_irq); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _spin_unlock_bh(raw_spinlock_t *lock) +void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { - __spin_unlock_bh(lock); + __raw_spin_unlock_bh(lock); } -EXPORT_SYMBOL(_spin_unlock_bh); +EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif #ifndef CONFIG_INLINE_READ_TRYLOCK -int __lockfunc _read_trylock(rwlock_t *lock) +int __lockfunc _raw_read_trylock(rwlock_t *lock) { - return __read_trylock(lock); + return __raw_read_trylock(lock); } -EXPORT_SYMBOL(_read_trylock); +EXPORT_SYMBOL(_raw_read_trylock); #endif #ifndef CONFIG_INLINE_READ_LOCK -void __lockfunc _read_lock(rwlock_t *lock) +void __lockfunc _raw_read_lock(rwlock_t *lock) { - __read_lock(lock); + __raw_read_lock(lock); } -EXPORT_SYMBOL(_read_lock); +EXPORT_SYMBOL(_raw_read_lock); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) +unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) { - return 
__read_lock_irqsave(lock); + return __raw_read_lock_irqsave(lock); } -EXPORT_SYMBOL(_read_lock_irqsave); +EXPORT_SYMBOL(_raw_read_lock_irqsave); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQ -void __lockfunc _read_lock_irq(rwlock_t *lock) +void __lockfunc _raw_read_lock_irq(rwlock_t *lock) { - __read_lock_irq(lock); + __raw_read_lock_irq(lock); } -EXPORT_SYMBOL(_read_lock_irq); +EXPORT_SYMBOL(_raw_read_lock_irq); #endif #ifndef CONFIG_INLINE_READ_LOCK_BH -void __lockfunc _read_lock_bh(rwlock_t *lock) +void __lockfunc _raw_read_lock_bh(rwlock_t *lock) { - __read_lock_bh(lock); + __raw_read_lock_bh(lock); } -EXPORT_SYMBOL(_read_lock_bh); +EXPORT_SYMBOL(_raw_read_lock_bh); #endif #ifndef CONFIG_INLINE_READ_UNLOCK -void __lockfunc _read_unlock(rwlock_t *lock) +void __lockfunc _raw_read_unlock(rwlock_t *lock) { - __read_unlock(lock); + __raw_read_unlock(lock); } -EXPORT_SYMBOL(_read_unlock); +EXPORT_SYMBOL(_raw_read_unlock); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { - __read_unlock_irqrestore(lock, flags); + __raw_read_unlock_irqrestore(lock, flags); } -EXPORT_SYMBOL(_read_unlock_irqrestore); +EXPORT_SYMBOL(_raw_read_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQ -void __lockfunc _read_unlock_irq(rwlock_t *lock) +void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) { - __read_unlock_irq(lock); + __raw_read_unlock_irq(lock); } -EXPORT_SYMBOL(_read_unlock_irq); +EXPORT_SYMBOL(_raw_read_unlock_irq); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_BH -void __lockfunc _read_unlock_bh(rwlock_t *lock) +void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) { - __read_unlock_bh(lock); + __raw_read_unlock_bh(lock); } -EXPORT_SYMBOL(_read_unlock_bh); +EXPORT_SYMBOL(_raw_read_unlock_bh); #endif #ifndef CONFIG_INLINE_WRITE_TRYLOCK -int __lockfunc _write_trylock(rwlock_t *lock) +int __lockfunc 
_raw_write_trylock(rwlock_t *lock) { - return __write_trylock(lock); + return __raw_write_trylock(lock); } -EXPORT_SYMBOL(_write_trylock); +EXPORT_SYMBOL(_raw_write_trylock); #endif #ifndef CONFIG_INLINE_WRITE_LOCK -void __lockfunc _write_lock(rwlock_t *lock) +void __lockfunc _raw_write_lock(rwlock_t *lock) { - __write_lock(lock); + __raw_write_lock(lock); } -EXPORT_SYMBOL(_write_lock); +EXPORT_SYMBOL(_raw_write_lock); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) +unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) { - return __write_lock_irqsave(lock); + return __raw_write_lock_irqsave(lock); } -EXPORT_SYMBOL(_write_lock_irqsave); +EXPORT_SYMBOL(_raw_write_lock_irqsave); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQ -void __lockfunc _write_lock_irq(rwlock_t *lock) +void __lockfunc _raw_write_lock_irq(rwlock_t *lock) { - __write_lock_irq(lock); + __raw_write_lock_irq(lock); } -EXPORT_SYMBOL(_write_lock_irq); +EXPORT_SYMBOL(_raw_write_lock_irq); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_BH -void __lockfunc _write_lock_bh(rwlock_t *lock) +void __lockfunc _raw_write_lock_bh(rwlock_t *lock) { - __write_lock_bh(lock); + __raw_write_lock_bh(lock); } -EXPORT_SYMBOL(_write_lock_bh); +EXPORT_SYMBOL(_raw_write_lock_bh); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK -void __lockfunc _write_unlock(rwlock_t *lock) +void __lockfunc _raw_write_unlock(rwlock_t *lock) { - __write_unlock(lock); + __raw_write_unlock(lock); } -EXPORT_SYMBOL(_write_unlock); +EXPORT_SYMBOL(_raw_write_unlock); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { - __write_unlock_irqrestore(lock, flags); + __raw_write_unlock_irqrestore(lock, flags); } -EXPORT_SYMBOL(_write_unlock_irqrestore); +EXPORT_SYMBOL(_raw_write_unlock_irqrestore); #endif #ifndef 
CONFIG_INLINE_WRITE_UNLOCK_IRQ -void __lockfunc _write_unlock_irq(rwlock_t *lock) +void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) { - __write_unlock_irq(lock); + __raw_write_unlock_irq(lock); } -EXPORT_SYMBOL(_write_unlock_irq); +EXPORT_SYMBOL(_raw_write_unlock_irq); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_BH -void __lockfunc _write_unlock_bh(rwlock_t *lock) +void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) { - __write_unlock_bh(lock); + __raw_write_unlock_bh(lock); } -EXPORT_SYMBOL(_write_unlock_bh); +EXPORT_SYMBOL(_raw_write_unlock_bh); #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC -void __lockfunc _spin_lock_nested(raw_spinlock_t *lock, int subclass) +void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) { preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock_nested); +EXPORT_SYMBOL(_raw_spin_lock_nested); -unsigned long __lockfunc _spin_lock_irqsave_nested(raw_spinlock_t *lock, +unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { unsigned long flags; @@ -361,16 +361,16 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(raw_spinlock_t *lock, do_raw_spin_lock_flags, &flags); return flags; } -EXPORT_SYMBOL(_spin_lock_irqsave_nested); +EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); -void __lockfunc _spin_lock_nest_lock(raw_spinlock_t *lock, +void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *nest_lock) { preempt_disable(); spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock_nest_lock); +EXPORT_SYMBOL(_raw_spin_lock_nest_lock); #endif -- cgit v1.2.1 From a26724591edba5acc528d41f3906a972590e8f54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 14:46:14 +0100 Subject: plist: Make plist debugging raw_spinlock aware plists are used with spinlocks 
and raw_spinlocks. Change the plist debugging to handle both types. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/futex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index d73ef1f3e55d..6af474df17bb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1010,7 +1010,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb2->lock; + q->list.plist.spinlock = &hb2->lock; #endif } get_futex_key_refs(key2); @@ -1046,7 +1046,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, q->lock_ptr = &hb->lock; #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb->lock; + q->list.plist.spinlock = &hb->lock; #endif wake_up_state(q->task, TASK_NORMAL); @@ -1394,7 +1394,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb->lock; + q->list.plist.spinlock = &hb->lock; #endif plist_add(&q->list, &hb->chain); q->task = current; -- cgit v1.2.1 From 05fa785cf80c9b7c0254c3056037147aed3ea16b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 14:28:38 +0100 Subject: sched: Convert rq->lock to raw_spinlock Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/sched.c | 183 ++++++++++++++++++++++++------------------------ kernel/sched_debug.c | 4 +- kernel/sched_fair.c | 4 +- kernel/sched_idletask.c | 4 +- kernel/sched_rt.c | 14 ++-- 5 files changed, 106 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 91c65dd91435..3dbe6178ebfd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -525,7 +525,7 @@ static struct root_domain def_root_domain; */ struct rq { /* runqueue lock: */ - spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -685,7 +685,7 @@ inline void update_rq_clock(struct rq *rq) */ int runqueue_is_locked(int cpu) { - return spin_is_locked(&cpu_rq(cpu)->lock); + return raw_spin_is_locked(&cpu_rq(cpu)->lock); } /* @@ -884,7 +884,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.rlock.owner = current; + rq->lock.owner = current; #endif /* * If we are tracking spinlock dependencies then we have to @@ -893,7 +893,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -917,9 +917,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) next->oncpu = 1; #endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); #else - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); #endif } @@ -949,10 +949,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) { for (;;) { struct rq *rq = task_rq(p); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - 
spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } } @@ -969,10 +969,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) for (;;) { local_irq_save(*flags); rq = task_rq(p); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock_irqrestore(&rq->lock, *flags); } } @@ -981,19 +981,19 @@ void task_rq_unlock_wait(struct task_struct *p) struct rq *rq = task_rq(p); smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - spin_unlock_wait(&rq->lock); + raw_spin_unlock_wait(&rq->lock); } static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) __releases(rq->lock) { - spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock_irqrestore(&rq->lock, *flags); } /* @@ -1006,7 +1006,7 @@ static struct rq *this_rq_lock(void) local_irq_disable(); rq = this_rq(); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); return rq; } @@ -1053,10 +1053,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); return HRTIMER_NORESTART; } @@ -1069,10 +1069,10 @@ static void __hrtick_start(void *arg) { struct rq *rq = arg; - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); hrtimer_restart(&rq->hrtick_timer); rq->hrtick_csd_pending = 0; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } /* @@ -1179,7 +1179,7 @@ static void resched_task(struct task_struct *p) { int cpu; - assert_spin_locked(&task_rq(p)->lock); + assert_raw_spin_locked(&task_rq(p)->lock); if (test_tsk_need_resched(p)) return; @@ -1201,10 +1201,10 @@ static void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); 
unsigned long flags; - if (!spin_trylock_irqsave(&rq->lock, flags)) + if (!raw_spin_trylock_irqsave(&rq->lock, flags)) return; resched_task(cpu_curr(cpu)); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } #ifdef CONFIG_NO_HZ @@ -1273,7 +1273,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) #else /* !CONFIG_SMP */ static void resched_task(struct task_struct *p) { - assert_spin_locked(&task_rq(p)->lock); + assert_raw_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } @@ -1600,11 +1600,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, struct rq *rq = cpu_rq(cpu); unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } } @@ -1706,9 +1706,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) if (root_task_group_empty()) return; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); update_shares(sd); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); } static void update_h_load(long cpu) @@ -1748,7 +1748,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->lock); double_rq_lock(this_rq, busiest); return 1; @@ -1769,14 +1769,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) { int ret = 0; - if (unlikely(!spin_trylock(&busiest->lock))) { + if (unlikely(!raw_spin_trylock(&busiest->lock))) { if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + raw_spin_unlock(&this_rq->lock); + raw_spin_lock(&busiest->lock); + 
raw_spin_lock_nested(&this_rq->lock, + SINGLE_DEPTH_NESTING); ret = 1; } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_nested(&busiest->lock, + SINGLE_DEPTH_NESTING); } return ret; } @@ -1790,7 +1792,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) { if (unlikely(!irqs_disabled())) { /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->lock); BUG_ON(1); } @@ -1800,7 +1802,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - spin_unlock(&busiest->lock); + raw_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } #endif @@ -2023,13 +2025,13 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) return; } - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); set_task_cpu(p, cpu); p->cpus_allowed = cpumask_of_cpu(cpu); p->rt.nr_cpus_allowed = 1; p->flags |= PF_THREAD_BOUND; - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } EXPORT_SYMBOL(kthread_bind); @@ -2781,10 +2783,10 @@ static inline void post_schedule(struct rq *rq) if (rq->post_schedule) { unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->curr->sched_class->post_schedule) rq->curr->sched_class->post_schedule(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->post_schedule = 0; } @@ -3066,15 +3068,15 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) { BUG_ON(!irqs_disabled()); if (rq1 == rq2) { - spin_lock(&rq1->lock); + raw_spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ } else { if (rq1 < rq2) { - spin_lock(&rq1->lock); - spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + 
raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); } else { - spin_lock(&rq2->lock); - spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); } } update_rq_clock(rq1); @@ -3091,9 +3093,9 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - spin_unlock(&rq1->lock); + raw_spin_unlock(&rq1->lock); if (rq1 != rq2) - spin_unlock(&rq2->lock); + raw_spin_unlock(&rq2->lock); else __release(rq2->lock); } @@ -4186,14 +4188,15 @@ redo: if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - spin_lock_irqsave(&busiest->lock, flags); + raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { - spin_unlock_irqrestore(&busiest->lock, flags); + raw_spin_unlock_irqrestore(&busiest->lock, + flags); all_pinned = 1; goto out_one_pinned; } @@ -4203,7 +4206,7 @@ redo: busiest->push_cpu = this_cpu; active_balance = 1; } - spin_unlock_irqrestore(&busiest->lock, flags); + raw_spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) wake_up_process(busiest->migration_thread); @@ -4385,10 +4388,10 @@ redo: /* * Should not call ttwu while holding a rq->lock */ - spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->lock); if (active_balance) wake_up_process(busiest->migration_thread); - spin_lock(&this_rq->lock); + raw_spin_lock(&this_rq->lock); } else sd->nr_balance_failed = 0; @@ -5257,11 +5260,11 @@ void scheduler_tick(void) sched_clock_tick(); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); perf_event_task_tick(curr, cpu); @@ -5455,7 +5458,7 @@ need_resched_nonpreemptible: if (sched_feat(HRTICK)) 
hrtick_clear(rq); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); clear_tsk_need_resched(prev); @@ -5491,7 +5494,7 @@ need_resched_nonpreemptible: cpu = smp_processor_id(); rq = cpu_rq(cpu); } else - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); post_schedule(rq); @@ -6980,7 +6983,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __sched_fork(idle); idle->se.exec_start = sched_clock(); @@ -6992,7 +6995,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) idle->oncpu = 1; #endif - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ #if defined(CONFIG_PREEMPT) @@ -7209,10 +7212,10 @@ static int migration_thread(void *data) struct migration_req *req; struct list_head *head; - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); if (cpu_is_offline(cpu)) { - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); break; } @@ -7224,7 +7227,7 @@ static int migration_thread(void *data) head = &rq->migration_queue; if (list_empty(head)) { - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); schedule(); set_current_state(TASK_INTERRUPTIBLE); continue; @@ -7233,14 +7236,14 @@ static int migration_thread(void *data) list_del_init(head->next); if (req->task != NULL) { - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); __migrate_task(req->task, cpu, req->dest_cpu); } else if (likely(cpu == (badcpu = smp_processor_id()))) { req->dest_cpu = RCU_MIGRATION_GOT_QS; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } else { req->dest_cpu = RCU_MIGRATION_MUST_SYNC; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); } 
local_irq_enable(); @@ -7363,14 +7366,14 @@ void sched_idle_next(void) * Strictly not necessary since rest of the CPUs are stopped by now * and interrupts disabled on the current cpu. */ - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); update_rq_clock(rq); activate_task(rq, p, 0); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -7406,9 +7409,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) * that's OK. No task can be added to this CPU, so iteration is * fine. */ - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); put_task_struct(p); } @@ -7674,13 +7677,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); break; #ifdef CONFIG_HOTPLUG_CPU @@ -7705,13 +7708,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) put_task_struct(rq->migration_thread); rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); @@ -7721,30 +7724,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) * they didn't take sched_hotcpu_mutex. 
Just wake up * the requestors. */ - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { struct migration_req *req; req = list_entry(rq->migration_queue.next, struct migration_req, list); list_del_init(&req->list); - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); complete(&req->done); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); } - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); break; case CPU_DYING: case CPU_DYING_FROZEN: /* Update our root-domain */ rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); break; #endif } @@ -7974,7 +7977,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) struct root_domain *old_rd = NULL; unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { old_rd = rq->rd; @@ -8000,7 +8003,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) free_rootdomain(old_rd); @@ -9357,7 +9360,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks, &rq->lock); + plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; @@ -9523,7 +9526,7 @@ void __init sched_init(void) struct rq *rq; rq = cpu_rq(i); - spin_lock_init(&rq->lock); + raw_spin_lock_init(&rq->lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; @@ -10115,9 +10118,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long 
shares) struct rq *rq = cfs_rq->rq; unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __set_se_shares(se, shares); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -10717,9 +10720,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. */ - spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_lock_irq(&cpu_rq(cpu)->lock); data = *cpuusage; - spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); #else data = *cpuusage; #endif @@ -10735,9 +10738,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ - spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_lock_irq(&cpu_rq(cpu)->lock); *cpuusage = val; - spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); #else *cpuusage = val; #endif @@ -10971,9 +10974,9 @@ void synchronize_sched_expedited(void) init_completion(&req->done); req->task = NULL; req->dest_cpu = RCU_MIGRATION_NEED_QS; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); list_add(&req->list, &rq->migration_queue); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); wake_up_process(rq->migration_thread); } for_each_online_cpu(cpu) { @@ -10981,11 +10984,11 @@ void synchronize_sched_expedited(void) req = &per_cpu(rcu_migration_req, cpu); rq = cpu_rq(cpu); wait_for_completion(&req->done); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) need_full_sync = 1; req->dest_cpu = RCU_MIGRATION_IDLE; - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; synchronize_sched_expedited_count++; 
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ae24fc65d75..67f95aada4b9 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -184,7 +184,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); @@ -192,7 +192,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) max_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 804a411838f1..5bedf6e3ebf3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1955,7 +1955,7 @@ static void task_fork_fair(struct task_struct *p) struct rq *rq = this_rq(); unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (unlikely(task_cpu(p) != this_cpu)) __set_task_cpu(p, this_cpu); @@ -1975,7 +1975,7 @@ static void task_fork_fair(struct task_struct *p) resched_task(rq->curr); } - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 33d5384a73a8..5f93b570d383 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -34,10 +34,10 @@ static struct task_struct *pick_next_task_idle(struct rq *rq) static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) { - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); printk(KERN_ERR "bad: scheduling from the 
idle thread!\n"); dump_stack(); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); } static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index aecbd9c6b20c..a8325a7ff94c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -454,9 +454,9 @@ static void disable_runtime(struct rq *rq) { unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __disable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static void __enable_runtime(struct rq *rq) @@ -486,9 +486,9 @@ static void enable_runtime(struct rq *rq) { unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __enable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static int balance_runtime(struct rt_rq *rt_rq) @@ -524,7 +524,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); if (rt_rq->rt_time) { u64 runtime; @@ -545,7 +545,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (enqueue) sched_rt_rq_enqueue(rt_rq); - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } return idle; @@ -1246,7 +1246,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) task_running(rq, task) || !task->se.on_rq)) { - spin_unlock(&lowest_rq->lock); + raw_spin_unlock(&lowest_rq->lock); lowest_rq = NULL; break; } -- cgit v1.2.1 From 0986b11b12ad2baed338385f79fd0dec58a23fc6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 15:32:06 +0100 Subject: sched: Convert rt_runtime_lock to raw_spinlock Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/sched.c | 28 ++++++++++++++-------------- kernel/sched_rt.c | 46 +++++++++++++++++++++++----------------------- 2 files changed, 37 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3dbe6178ebfd..01c5016e57f1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -141,7 +141,7 @@ struct rt_prio_array { struct rt_bandwidth { /* nests inside the rq lock: */ - spinlock_t rt_runtime_lock; + raw_spinlock_t rt_runtime_lock; ktime_t rt_period; u64 rt_runtime; struct hrtimer rt_period_timer; @@ -178,7 +178,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period = ns_to_ktime(period); rt_b->rt_runtime = runtime; - spin_lock_init(&rt_b->rt_runtime_lock); + raw_spin_lock_init(&rt_b->rt_runtime_lock); hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -200,7 +200,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) if (hrtimer_active(&rt_b->rt_period_timer)) return; - spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); for (;;) { unsigned long delta; ktime_t soft, hard; @@ -217,7 +217,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, HRTIMER_MODE_ABS_PINNED, 0); } - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); } #ifdef CONFIG_RT_GROUP_SCHED @@ -470,7 +470,7 @@ struct rt_rq { u64 rt_time; u64 rt_runtime; /* Nests inside the rq lock: */ - spinlock_t rt_runtime_lock; + raw_spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; @@ -9366,7 +9366,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; - spin_lock_init(&rt_rq->rt_runtime_lock); + raw_spin_lock_init(&rt_rq->rt_runtime_lock); #ifdef CONFIG_RT_GROUP_SCHED 
rt_rq->rt_nr_boosted = 0; @@ -10305,18 +10305,18 @@ static int tg_set_bandwidth(struct task_group *tg, if (err) goto unlock; - spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); tg->rt_bandwidth.rt_runtime = rt_runtime; for_each_possible_cpu(i) { struct rt_rq *rt_rq = tg->rt_rq[i]; - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_runtime; - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); + raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -10421,15 +10421,15 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_runtime == 0) return -EBUSY; - spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = global_rt_runtime(); - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); return 0; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a8325a7ff94c..d2ea2828164e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -327,7 +327,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) weight = cpumask_weight(rd->span); - spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); @@ -336,7 +336,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) if (iter == rt_rq) continue; - 
spin_lock(&iter->rt_runtime_lock); + raw_spin_lock(&iter->rt_runtime_lock); /* * Either all rqs have inf runtime and there's nothing to steal * or __disable_runtime() below sets a specific rq to inf to @@ -358,14 +358,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq) rt_rq->rt_runtime += diff; more = 1; if (rt_rq->rt_runtime == rt_period) { - spin_unlock(&iter->rt_runtime_lock); + raw_spin_unlock(&iter->rt_runtime_lock); break; } } next: - spin_unlock(&iter->rt_runtime_lock); + raw_spin_unlock(&iter->rt_runtime_lock); } - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); return more; } @@ -386,8 +386,8 @@ static void __disable_runtime(struct rq *rq) s64 want; int i; - spin_lock(&rt_b->rt_runtime_lock); - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); /* * Either we're all inf and nobody needs to borrow, or we're * already disabled and thus have nothing to do, or we have @@ -396,7 +396,7 @@ static void __disable_runtime(struct rq *rq) if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); /* * Calculate the difference between what we started out with @@ -418,7 +418,7 @@ static void __disable_runtime(struct rq *rq) if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; - spin_lock(&iter->rt_runtime_lock); + raw_spin_lock(&iter->rt_runtime_lock); if (want > 0) { diff = min_t(s64, iter->rt_runtime, want); iter->rt_runtime -= diff; @@ -427,13 +427,13 @@ static void __disable_runtime(struct rq *rq) iter->rt_runtime -= want; want -= want; } - spin_unlock(&iter->rt_runtime_lock); + raw_spin_unlock(&iter->rt_runtime_lock); if (!want) break; } - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); /* * We cannot be left wanting - that would mean some runtime * leaked out of the system. 
@@ -445,8 +445,8 @@ balanced: * runtime - in which case borrowing doesn't make sense. */ rt_rq->rt_runtime = RUNTIME_INF; - spin_unlock(&rt_rq->rt_runtime_lock); - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); } } @@ -472,13 +472,13 @@ static void __enable_runtime(struct rq *rq) for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - spin_lock(&rt_b->rt_runtime_lock); - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; - spin_unlock(&rt_rq->rt_runtime_lock); - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); } } @@ -496,9 +496,9 @@ static int balance_runtime(struct rt_rq *rt_rq) int more = 0; if (rt_rq->rt_time > rt_rq->rt_runtime) { - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); more = do_balance_runtime(rt_rq); - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); } return more; @@ -528,7 +528,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time) { u64 runtime; - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); if (rt_rq->rt_throttled) balance_runtime(rt_rq); runtime = rt_rq->rt_runtime; @@ -539,7 +539,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } else if (rt_rq->rt_nr_running) idle = 0; @@ -624,11 +624,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += 
delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } } } -- cgit v1.2.1 From fe841226bd954fba4fd79f037a876053fe9c3217 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 17:59:15 +0100 Subject: sched: Convert cpupri lock to raw_spinlock Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/sched_cpupri.c | 10 +++++----- kernel/sched_cpupri.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 0f052fc674d5..597b33099dfa 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -135,26 +135,26 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) if (likely(newpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; - spin_lock_irqsave(&vec->lock, flags); + raw_spin_lock_irqsave(&vec->lock, flags); cpumask_set_cpu(cpu, vec->mask); vec->count++; if (vec->count == 1) set_bit(newpri, cp->pri_active); - spin_unlock_irqrestore(&vec->lock, flags); + raw_spin_unlock_irqrestore(&vec->lock, flags); } if (likely(oldpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - spin_lock_irqsave(&vec->lock, flags); + raw_spin_lock_irqsave(&vec->lock, flags); vec->count--; if (!vec->count) clear_bit(oldpri, cp->pri_active); cpumask_clear_cpu(cpu, vec->mask); - spin_unlock_irqrestore(&vec->lock, flags); + raw_spin_unlock_irqrestore(&vec->lock, flags); } *currpri = newpri; @@ -180,7 +180,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { struct cpupri_vec *vec = &cp->pri_to_cpu[i]; - spin_lock_init(&vec->lock); + raw_spin_lock_init(&vec->lock); vec->count = 0; if (!zalloc_cpumask_var(&vec->mask, gfp)) goto cleanup; diff --git a/kernel/sched_cpupri.h 
b/kernel/sched_cpupri.h index 9a7e859b8fbf..7cb5bb6b95be 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -12,7 +12,7 @@ /* values 2-101 are RT priorities 0-99 */ struct cpupri_vec { - spinlock_t lock; + raw_spinlock_t lock; int count; cpumask_var_t mask; }; -- cgit v1.2.1 From 1d615482547584b9a8bb6316a58fed6ce90dd9ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 14:54:03 +0100 Subject: sched: Convert pi_lock to raw_spinlock Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/exit.c | 2 +- kernel/fork.c | 4 ++-- kernel/futex.c | 38 ++++++++++++++++----------------- kernel/rtmutex-debug.c | 4 ++-- kernel/rtmutex.c | 58 +++++++++++++++++++++++++------------------------- kernel/sched.c | 12 +++++------ 6 files changed, 59 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 6f50ef55a6f3..5962d7ccf243 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -933,7 +933,7 @@ NORET_TYPE void do_exit(long code) * an exiting task cleaning up the robust pi futexes. 
*/ smp_mb(); - spin_unlock_wait(&tsk->pi_lock); + raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", diff --git a/kernel/fork.c b/kernel/fork.c index 1415dc4598ae..9bd91447e052 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -939,9 +939,9 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) static void rt_mutex_init_task(struct task_struct *p) { - spin_lock_init(&p->pi_lock); + raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init(&p->pi_waiters, &p->pi_lock); + plist_head_init_raw(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; #endif } diff --git a/kernel/futex.c b/kernel/futex.c index 6af474df17bb..320b369d20b5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -403,9 +403,9 @@ static void free_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock_irq(&pi_state->owner->pi_lock); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock_irq(&pi_state->owner->pi_lock); rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); } @@ -470,18 +470,18 @@ void exit_pi_state_list(struct task_struct *curr) * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ - spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; hb = hash_futex(&key); - spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); - spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: @@ -495,15 +495,15 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); 
pi_state->owner = NULL; - spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock_irq(&curr->pi_lock); rt_mutex_unlock(&pi_state->pi_mutex); spin_unlock(&hb->lock); - spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&curr->pi_lock); } - spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock_irq(&curr->pi_lock); } static int @@ -558,7 +558,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, * change of the task flags, we do this protected by * p->pi_lock: */ - spin_lock_irq(&p->pi_lock); + raw_spin_lock_irq(&p->pi_lock); if (unlikely(p->flags & PF_EXITING)) { /* * The task is on the way out. When PF_EXITPIDONE is @@ -567,7 +567,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, */ int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; - spin_unlock_irq(&p->pi_lock); + raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); return ret; } @@ -586,7 +586,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; - spin_unlock_irq(&p->pi_lock); + raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); @@ -794,16 +794,16 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) } } - spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock_irq(&pi_state->owner->pi_lock); - spin_lock_irq(&new_owner->pi_lock); + raw_spin_lock_irq(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; - spin_unlock_irq(&new_owner->pi_lock); + raw_spin_unlock_irq(&new_owner->pi_lock); spin_unlock(&pi_state->pi_mutex.wait_lock); rt_mutex_unlock(&pi_state->pi_mutex); @@ -1529,18 +1529,18 @@ retry: * itself. 
*/ if (pi_state->owner != NULL) { - spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock_irq(&pi_state->owner->pi_lock); } pi_state->owner = newowner; - spin_lock_irq(&newowner->pi_lock); + raw_spin_lock_irq(&newowner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); - spin_unlock_irq(&newowner->pi_lock); + raw_spin_unlock_irq(&newowner->pi_lock); return 0; /* diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 5fcb4fe645e2..ddabb54bb5c8 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -37,8 +37,8 @@ do { \ if (rt_trace_on) { \ rt_trace_on = 0; \ console_verbose(); \ - if (spin_is_locked(&current->pi_lock)) \ - spin_unlock(&current->pi_lock); \ + if (raw_spin_is_locked(&current->pi_lock)) \ + raw_spin_unlock(&current->pi_lock); \ } \ } while (0) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 29bd4baf9e75..d33da470f9da 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -138,9 +138,9 @@ static void rt_mutex_adjust_prio(struct task_struct *task) { unsigned long flags; - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); } /* @@ -195,7 +195,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * Task can not go away as we did a get_task() before !
*/ - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; /* @@ -232,7 +232,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, lock = waiter->lock; if (!spin_trylock(&lock->wait_lock)) { - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); cpu_relax(); goto retry; } @@ -253,13 +253,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, plist_add(&waiter->list_entry, &lock->wait_list); /* Release the task */ - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); put_task_struct(task); /* Grab the next task */ task = rt_mutex_owner(lock); get_task_struct(task); - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ @@ -277,7 +277,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); spin_unlock(&lock->wait_lock); @@ -288,7 +288,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); out_put_task: put_task_struct(task); @@ -313,9 +313,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, if (pendowner == task) return 1; - spin_lock_irqsave(&pendowner->pi_lock, flags); + raw_spin_lock_irqsave(&pendowner->pi_lock, flags); if (task->prio >= pendowner->prio) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 0; } @@ -325,7 +325,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * priority. 
*/ if (likely(!rt_mutex_has_waiters(lock))) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 1; } @@ -333,7 +333,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, next = rt_mutex_top_waiter(lock); plist_del(&next->pi_list_entry, &pendowner->pi_waiters); __rt_mutex_adjust_prio(pendowner); - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); /* * We are going to steal the lock and a waiter was @@ -350,10 +350,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * might be task: */ if (likely(next->task != task)) { - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); plist_add(&next->pi_list_entry, &task->pi_waiters); __rt_mutex_adjust_prio(task); - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); } return 1; } @@ -420,7 +420,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -434,17 +434,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { - spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) chain_walk = 1; @@ -483,7 +483,7 @@ 
static void wakeup_next_waiter(struct rt_mutex *lock) struct task_struct *pendowner; unsigned long flags; - spin_lock_irqsave(&current->pi_lock, flags); + raw_spin_lock_irqsave(&current->pi_lock, flags); waiter = rt_mutex_top_waiter(lock); plist_del(&waiter->list_entry, &lock->wait_list); @@ -500,7 +500,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); - spin_unlock_irqrestore(&current->pi_lock, flags); + raw_spin_unlock_irqrestore(&current->pi_lock, flags); /* * Clear the pi_blocked_on variable and enqueue a possible @@ -509,7 +509,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * waiter with higher priority than pending-owner->normal_prio * is blocked on the unboosted (pending) owner. */ - spin_lock_irqsave(&pendowner->pi_lock, flags); + raw_spin_lock_irqsave(&pendowner->pi_lock, flags); WARN_ON(!pendowner->pi_blocked_on); WARN_ON(pendowner->pi_blocked_on != waiter); @@ -523,7 +523,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) next = rt_mutex_top_waiter(lock); plist_add(&next->pi_list_entry, &pendowner->pi_waiters); } - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); wake_up_process(pendowner); } @@ -541,15 +541,15 @@ static void remove_waiter(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0; - spin_lock_irqsave(&current->pi_lock, flags); + raw_spin_lock_irqsave(&current->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); waiter->task = NULL; current->pi_blocked_on = NULL; - spin_unlock_irqrestore(&current->pi_lock, flags); + raw_spin_unlock_irqrestore(&current->pi_lock, flags); if (first && owner != current) { - spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&waiter->pi_list_entry, &owner->pi_waiters); @@ -564,7 +564,7 @@ static void remove_waiter(struct rt_mutex *lock, if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); +
raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); @@ -592,15 +592,15 @@ void rt_mutex_adjust_pi(struct task_struct *task) struct rt_mutex_waiter *waiter; unsigned long flags; - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; if (!waiter || waiter->list_entry.prio == task->prio) { - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); diff --git a/kernel/sched.c b/kernel/sched.c index 01c5016e57f1..18cceeecce35 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6323,7 +6323,7 @@ recheck: * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: */ - spin_lock_irqsave(&p->pi_lock, flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. 
@@ -6333,7 +6333,7 @@ recheck: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } update_rq_clock(rq); @@ -6357,7 +6357,7 @@ recheck: check_class_changed(rq, p, prev_class, oldprio, running); } __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); rt_mutex_adjust_pi(p); @@ -9624,7 +9624,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); + plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); #endif /* @@ -9749,13 +9749,13 @@ void normalize_rt_tasks(void) continue; } - spin_lock(&p->pi_lock); + raw_spin_lock(&p->pi_lock); rq = __task_rq_lock(p); normalize_task(rq, p); __task_rq_unlock(rq); - spin_unlock(&p->pi_lock); + raw_spin_unlock(&p->pi_lock); } while_each_thread(g, p); read_unlock_irqrestore(&tasklist_lock, flags); -- cgit v1.2.1 From d209d74d52ab39dc071656533cac095294f70de7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 18:22:11 +0100 Subject: rtmutes: Convert rtmutex.lock to raw_spinlock Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/futex.c | 6 +++--- kernel/rtmutex.c | 48 ++++++++++++++++++++++++------------------------ 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 320b369d20b5..8e3c3ffe1b9a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -760,7 +760,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; - spin_lock(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -789,7 +789,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) else if (curval != uval) ret = -EINVAL; if (ret) { - spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); return ret; } } @@ -805,7 +805,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) pi_state->owner = new_owner; raw_spin_unlock_irq(&new_owner->pi_lock); - spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); rt_mutex_unlock(&pi_state->pi_mutex); return 0; diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index d33da470f9da..a9604815786a 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -231,7 +231,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; lock = waiter->lock; - if (!spin_trylock(&lock->wait_lock)) { + if (!raw_spin_trylock(&lock->wait_lock)) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); cpu_relax(); goto retry; @@ -240,7 +240,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Deadlock detection */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); ret = deadlock_detect ? 
-EDEADLK : 0; goto out_unlock_pi; } @@ -280,7 +280,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, raw_spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -459,12 +459,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, task); - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); return res; } @@ -575,11 +575,11 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); } /* @@ -672,14 +672,14 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); if (waiter->task) schedule_rt_mutex(lock); - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); set_current_state(state); } @@ -700,11 +700,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, debug_rt_mutex_init_waiter(&waiter); waiter.task = NULL; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); return 0; } @@ -731,7 +731,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); /* Remove pending timer: */ if (unlikely(timeout)) @@ -758,7 +758,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) 
{ int ret = 0; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); if (likely(rt_mutex_owner(lock) != current)) { @@ -770,7 +770,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) fixup_rt_mutex_waiters(lock); } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); return ret; } @@ -781,7 +781,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) { - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); debug_rt_mutex_unlock(lock); @@ -789,13 +789,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock) if (!rt_mutex_has_waiters(lock)) { lock->owner = NULL; - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); return; } wakeup_next_waiter(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); /* Undo pi boosting if necessary: */ rt_mutex_adjust_prio(current); @@ -970,8 +970,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; - spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list, &lock->wait_lock); + raw_spin_lock_init(&lock->wait_lock); + plist_head_init_raw(&lock->wait_list, &lock->wait_lock); debug_rt_mutex_init(lock, name); } @@ -1032,7 +1032,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, { int ret; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); mark_rt_mutex_waiters(lock); @@ -1040,7 +1040,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, /* We got the lock for task. 
*/ debug_rt_mutex_lock(lock); rt_mutex_set_owner(lock, task, 0); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); rt_mutex_deadlock_account_lock(lock, task); return 1; } @@ -1056,7 +1056,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, */ ret = 0; } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1106,7 +1106,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, { int ret; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -1124,7 +1124,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); /* * Readjust priority, when we did not get the lock. We might have been -- cgit v1.2.1 From 9f5a5621e78cf48d86682a71ceb3fcdbde38b222 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 15:40:01 +0100 Subject: smp: Convert smplocks to raw_spinlocks Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/smp.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index a8c76069cf50..6e7c7fdcd9b4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -16,11 +16,11 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); static struct { struct list_head queue; - spinlock_t lock; + raw_spinlock_t lock; } call_function __cacheline_aligned_in_smp = { .queue = LIST_HEAD_INIT(call_function.queue), - .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock), }; enum { @@ -35,7 +35,7 @@ struct call_function_data { struct call_single_queue { struct list_head list; - spinlock_t lock; + raw_spinlock_t lock; }; static DEFINE_PER_CPU(struct call_function_data, cfd_data); @@ -80,7 +80,7 @@ static int __cpuinit init_call_single_data(void) for_each_possible_cpu(i) { struct call_single_queue *q = &per_cpu(call_single_queue, i); - spin_lock_init(&q->lock); + raw_spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } @@ -141,10 +141,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) unsigned long flags; int ipi; - spin_lock_irqsave(&dst->lock, flags); + raw_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); list_add_tail(&data->list, &dst->list); - spin_unlock_irqrestore(&dst->lock, flags); + raw_spin_unlock_irqrestore(&dst->lock, flags); /* * The list addition should be visible before sending the IPI @@ -201,9 +201,9 @@ void generic_smp_call_function_interrupt(void) refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); if (!refs) { - spin_lock(&call_function.lock); + raw_spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); - spin_unlock(&call_function.lock); + raw_spin_unlock(&call_function.lock); } if (refs) @@ -230,9 +230,9 @@ void generic_smp_call_function_single_interrupt(void) */ 
WARN_ON_ONCE(!cpu_online(smp_processor_id())); - spin_lock(&q->lock); + raw_spin_lock(&q->lock); list_replace_init(&q->list, &list); - spin_unlock(&q->lock); + raw_spin_unlock(&q->lock); while (!list_empty(&list)) { struct call_single_data *data; @@ -449,14 +449,14 @@ void smp_call_function_many(const struct cpumask *mask, cpumask_clear_cpu(this_cpu, data->cpumask); atomic_set(&data->refs, cpumask_weight(data->cpumask)); - spin_lock_irqsave(&call_function.lock, flags); + raw_spin_lock_irqsave(&call_function.lock, flags); /* * Place entry at the _HEAD_ of the list, so that any cpu still * observing the entry in generic_smp_call_function_interrupt() * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); - spin_unlock_irqrestore(&call_function.lock, flags); + raw_spin_unlock_irqrestore(&call_function.lock, flags); /* * Make the list addition visible before sending the ipi. @@ -501,20 +501,20 @@ EXPORT_SYMBOL(smp_call_function); void ipi_call_lock(void) { - spin_lock(&call_function.lock); + raw_spin_lock(&call_function.lock); } void ipi_call_unlock(void) { - spin_unlock(&call_function.lock); + raw_spin_unlock(&call_function.lock); } void ipi_call_lock_irq(void) { - spin_lock_irq(&call_function.lock); + raw_spin_lock_irq(&call_function.lock); } void ipi_call_unlock_irq(void) { - spin_unlock_irq(&call_function.lock); + raw_spin_unlock_irq(&call_function.lock); } -- cgit v1.2.1 From 239007b8440abff689632f50cdf0f2b9e895b534 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 16:46:45 +0100 Subject: genirq: Convert irq_desc.lock to raw_spinlock Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/irq/autoprobe.c | 20 +++++------ kernel/irq/chip.c | 86 +++++++++++++++++++++++------------------------ kernel/irq/handle.c | 22 ++++++------ kernel/irq/internals.h | 2 +- kernel/irq/manage.c | 50 +++++++++++++-------------- kernel/irq/migration.c | 2 +- kernel/irq/numa_migrate.c | 8 ++--- kernel/irq/pm.c | 8 ++--- kernel/irq/proc.c | 4 +-- kernel/irq/spurious.c | 14 ++++---- 10 files changed, 108 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 1de9700f416e..2295a31ef110 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -45,7 +45,7 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. */ for_each_irq_desc_reverse(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { /* * An old-style architecture might still have @@ -61,7 +61,7 @@ unsigned long probe_irq_on(void) desc->chip->set_type(i, IRQ_TYPE_PROBE); desc->chip->startup(i); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } /* Wait for longstanding interrupts to trigger. 
*/ @@ -73,13 +73,13 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for_each_irq_desc_reverse(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; if (desc->chip->startup(i)) desc->status |= IRQ_PENDING; } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } /* @@ -91,7 +91,7 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -103,7 +103,7 @@ unsigned long probe_irq_on(void) if (i < 32) mask |= 1 << i; } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } return mask; @@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val) int i; for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val) desc->status = status & ~IRQ_AUTODETECT; desc->chip->shutdown(i); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); @@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val) unsigned int status; for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val) desc->status = status & ~IRQ_AUTODETECT; desc->chip->shutdown(i); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ba566c261adc..ecc3fa28f666 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,7 +34,7 @@ void dynamic_irq_init(unsigned int irq) } /* Ensure we don't 
have left over values from a previous use of this irq */ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; desc->chip = &no_irq_chip; desc->handle_irq = handle_bad_irq; @@ -51,7 +51,7 @@ void dynamic_irq_init(unsigned int irq) cpumask_clear(desc->pending_mask); #endif #endif - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } /** @@ -68,9 +68,9 @@ void dynamic_irq_cleanup(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); if (desc->action) { - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", irq); return; @@ -82,7 +82,7 @@ void dynamic_irq_cleanup(unsigned int irq) desc->chip = &no_irq_chip; desc->name = NULL; clear_kstat_irqs(desc); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } @@ -104,10 +104,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -133,9 +133,9 @@ int set_irq_type(unsigned int irq, unsigned int type) if (type == IRQ_TYPE_NONE) return 0; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); ret = __irq_set_trigger(desc, irq, type); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } EXPORT_SYMBOL(set_irq_type); @@ -158,9 +158,9 @@ int set_irq_data(unsigned int irq, void *data) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->handler_data = data; - spin_unlock_irqrestore(&desc->lock, flags); 
+ raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(set_irq_data); @@ -183,11 +183,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; if (entry) entry->irq = irq; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -214,9 +214,9 @@ int set_irq_chip_data(unsigned int irq, void *data) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->chip_data = data; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -241,12 +241,12 @@ void set_irq_nested_thread(unsigned int irq, int nest) if (!desc) return; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); if (nest) desc->status |= IRQ_NESTED_THREAD; else desc->status &= ~IRQ_NESTED_THREAD; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL_GPL(set_irq_nested_thread); @@ -343,7 +343,7 @@ void handle_nested_irq(unsigned int irq) might_sleep(); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); kstat_incr_irqs_this_cpu(irq, desc); @@ -352,17 +352,17 @@ void handle_nested_irq(unsigned int irq) goto out_unlock; desc->status |= IRQ_INPROGRESS; - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out_unlock: - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } EXPORT_SYMBOL_GPL(handle_nested_irq); @@ -384,7 +384,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + 
raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -396,16 +396,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->status |= IRQ_INPROGRESS; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out_unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } /** @@ -424,7 +424,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); mask_ack_irq(desc, irq); if (unlikely(desc->status & IRQ_INPROGRESS)) @@ -441,13 +441,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->status |= IRQ_INPROGRESS; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; if (unlikely(desc->status & IRQ_ONESHOT)) @@ -455,7 +455,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) desc->chip->unmask(irq); out_unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); @@ -475,7 +475,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out; @@ -497,18 +497,18 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status |= IRQ_INPROGRESS; desc->status &= ~IRQ_PENDING; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if 
(!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } /** @@ -530,7 +530,7 @@ out: void handle_edge_irq(unsigned int irq, struct irq_desc *desc) { - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); @@ -576,17 +576,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) } desc->status &= ~IRQ_PENDING; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); desc->status &= ~IRQ_INPROGRESS; out_unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } /** @@ -643,7 +643,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, } chip_bus_lock(irq, desc); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); /* Uninstall? 
*/ if (handle == handle_bad_irq) { @@ -661,7 +661,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->depth = 0; desc->chip->startup(irq); } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL_GPL(__set_irq_handler); @@ -692,9 +692,9 @@ void __init set_irq_noprobe(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOPROBE; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } void __init set_irq_probe(unsigned int irq) @@ -707,7 +707,7 @@ void __init set_irq_probe(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->status &= ~IRQ_NOPROBE; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 17c71bb565c6..814940e7f485 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -80,7 +80,7 @@ static struct irq_desc irq_desc_init = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) @@ -108,7 +108,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - spin_lock_init(&desc->lock); + raw_spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP desc->node = node; @@ -130,7 +130,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) /* * Protect the sparse_irqs: */ -DEFINE_SPINLOCK(sparse_irq_lock); +DEFINE_RAW_SPINLOCK(sparse_irq_lock); struct irq_desc **irq_desc_ptrs __read_mostly; @@ -141,7 +141,7 @@ static struct irq_desc 
irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), } }; @@ -212,7 +212,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) if (desc) return desc; - spin_lock_irqsave(&sparse_irq_lock, flags); + raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ desc = irq_desc_ptrs[irq]; @@ -234,7 +234,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) irq_desc_ptrs[irq] = desc; out_unlock: - spin_unlock_irqrestore(&sparse_irq_lock, flags); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); return desc; } @@ -247,7 +247,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), } }; @@ -473,7 +473,7 @@ unsigned int __do_IRQ(unsigned int irq) return 1; } - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (desc->chip->ack) desc->chip->ack(irq); /* @@ -517,13 +517,13 @@ unsigned int __do_IRQ(unsigned int irq) for (;;) { irqreturn_t action_ret; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (likely(!(desc->status & IRQ_PENDING))) break; desc->status &= ~IRQ_PENDING; @@ -536,7 +536,7 @@ out: * disabled while the handler was running. 
*/ desc->chip->end(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); return 1; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 1b5d742c6a77..b2821f070a3d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -18,7 +18,7 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); -extern spinlock_t sparse_irq_lock; +extern raw_spinlock_t sparse_irq_lock; #ifdef CONFIG_SPARSE_IRQ /* irq_desc_ptrs allocated at boot time */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7305b297d1eb..eb6078ca60c7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); status = desc->status; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? 
*/ } while (status & IRQ_INPROGRESS); @@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) if (!desc->chip->set_affinity) return -EINVAL; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT) { @@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) } #endif desc->status |= IRQ_AFFINITY_SET; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq) unsigned long flags; int ret; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); if (!ret) irq_set_thread_affinity(desc); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -231,9 +231,9 @@ void disable_irq_nosync(unsigned int irq) return; chip_bus_lock(irq, desc); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, false); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(disable_irq_nosync); @@ -308,9 +308,9 @@ void enable_irq(unsigned int irq) return; chip_bus_lock(irq, desc); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(enable_irq); @@ -347,7 +347,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. 
*/ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); @@ -368,7 +368,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) } } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } EXPORT_SYMBOL(set_irq_wake); @@ -484,12 +484,12 @@ static int irq_wait_for_interrupt(struct irqaction *action) static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { chip_bus_lock(irq, desc); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->chip->unmask(irq); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(irq, desc); } @@ -514,9 +514,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) return; } - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); cpumask_copy(mask, desc->affinity); - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); @@ -545,7 +545,7 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (unlikely(desc->status & IRQ_DISABLED)) { /* * CHECKME: We might need a dedicated @@ -555,9 +555,9 @@ static int irq_thread(void *data) * retriggers the interrupt itself --- tglx */ desc->status |= IRQ_PENDING; - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } else { - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); action->thread_fn(action->irq, action->dev_id); @@ -679,7 +679,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* * The following block of code has to be executed atomically */ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); 
old_ptr = &desc->action; old = *old_ptr; if (old) { @@ -775,7 +775,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) __enable_irq(desc, irq, false); } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); /* * Strictly no need to wake it up, but hung_task complains @@ -802,7 +802,7 @@ mismatch: ret = -EBUSY; out_thread: - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); if (new->thread) { struct task_struct *t = new->thread; @@ -844,7 +844,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc) return NULL; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); /* * There can be multiple actions per IRQ descriptor, find the right @@ -856,7 +856,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return NULL; } @@ -884,7 +884,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index fcb6c96f2627..241962280836 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -27,7 +27,7 @@ void move_masked_irq(int irq) if (!desc->chip->set_affinity) return; - assert_spin_locked(&desc->lock); + assert_raw_spin_locked(&desc->lock); /* * If there was a valid mask to work with, please diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 3fd30197da2e..26bac9d8f860 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -42,7 +42,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, "for migration.\n", irq); return 
false; } - spin_lock_init(&desc->lock); + raw_spin_lock_init(&desc->lock); desc->node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); @@ -67,7 +67,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, irq = old_desc->irq; - spin_lock_irqsave(&sparse_irq_lock, flags); + raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ desc = irq_desc_ptrs[irq]; @@ -91,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, } irq_desc_ptrs[irq] = desc; - spin_unlock_irqrestore(&sparse_irq_lock, flags); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); /* free the old one */ free_one_irq_desc(old_desc, desc); @@ -100,7 +100,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, return desc; out_unlock: - spin_unlock_irqrestore(&sparse_irq_lock, flags); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); return desc; } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index a0bb09e79867..0d4005d85b03 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -28,9 +28,9 @@ void suspend_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, true); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } for_each_irq_desc(irq, desc) @@ -56,9 +56,9 @@ void resume_device_irqs(void) if (!(desc->status & IRQ_SUSPENDED)) continue; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } } EXPORT_SYMBOL_GPL(resume_device_irqs); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 0832145fea97..6f50eccc79c0 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -179,7 
+179,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) unsigned long flags; int ret = 1; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); for (action = desc->action ; action; action = action->next) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { @@ -187,7 +187,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) break; } } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index e49ea1c5232d..89fb90ae534f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc) struct irqaction *action; int ok = 0, work = 0; - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); /* Already running on another processor */ if (desc->status & IRQ_INPROGRESS) { /* @@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc) */ if (desc->action && (desc->action->flags & IRQF_SHARED)) desc->status |= IRQ_PENDING; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); return ok; } /* Honour the normal IRQ locking */ desc->status |= IRQ_INPROGRESS; action = desc->action; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); while (action) { /* Only shared IRQ handlers are safe to call */ @@ -56,7 +56,7 @@ static int try_one_irq(int irq, struct irq_desc *desc) } local_irq_disable(); /* Now clean up the flags */ - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); action = desc->action; /* @@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc) * Perform real IRQ processing for the IRQ we deferred */ work = 1; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); handle_IRQ_event(irq, action); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_PENDING; } desc->status &= ~IRQ_INPROGRESS; @@ -80,7 
+80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc) */ if (work && desc->chip && desc->chip->end) desc->chip->end(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); return ok; } -- cgit v1.2.1 From ecb49d1a639acbacfc3771cae5ec07bed5df3847 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 16:36:54 +0100 Subject: hrtimers: Convert to raw_spinlocks Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/hrtimer.c | 50 +++++++++++++++++++++++------------------------ kernel/time/timer_list.c | 6 +++--- kernel/time/timer_stats.c | 17 ++++++++-------- 3 files changed, 37 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d2f9239dc6ba..0086628b6e97 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -127,11 +127,11 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, for (;;) { base = timer->base; if (likely(base != NULL)) { - spin_lock_irqsave(&base->cpu_base->lock, *flags); + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ - spin_unlock_irqrestore(&base->cpu_base->lock, *flags); + raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } @@ -208,13 +208,13 @@ again: /* See the comment in lock_timer_base() */ timer->base = NULL; - spin_unlock(&base->cpu_base->lock); - spin_lock(&new_base->cpu_base->lock); + raw_spin_unlock(&base->cpu_base->lock); + raw_spin_lock(&new_base->cpu_base->lock); if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { cpu = this_cpu; - spin_unlock(&new_base->cpu_base->lock); - spin_lock(&base->cpu_base->lock); + raw_spin_unlock(&new_base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); timer->base = base; goto again; } @@ -230,7 +230,7 @@ lock_hrtimer_base(const struct hrtimer 
*timer, unsigned long *flags) { struct hrtimer_clock_base *base = timer->base; - spin_lock_irqsave(&base->cpu_base->lock, *flags); + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } @@ -628,12 +628,12 @@ static void retrigger_next_event(void *arg) base = &__get_cpu_var(hrtimer_bases); /* Adjust CLOCK_REALTIME offset */ - spin_lock(&base->lock); + raw_spin_lock(&base->lock); base->clock_base[CLOCK_REALTIME].offset = timespec_to_ktime(realtime_offset); hrtimer_force_reprogram(base, 0); - spin_unlock(&base->lock); + raw_spin_unlock(&base->lock); } /* @@ -694,9 +694,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { if (wakeup) { - spin_unlock(&base->cpu_base->lock); + raw_spin_unlock(&base->cpu_base->lock); raise_softirq_irqoff(HRTIMER_SOFTIRQ); - spin_lock(&base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); } else __raise_softirq_irqoff(HRTIMER_SOFTIRQ); @@ -790,7 +790,7 @@ static inline void timer_stats_account_hrtimer(struct hrtimer *timer) static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); + raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** @@ -1123,7 +1123,7 @@ ktime_t hrtimer_get_next_event(void) unsigned long flags; int i; - spin_lock_irqsave(&cpu_base->lock, flags); + raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!hrtimer_hres_active()) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { @@ -1140,7 +1140,7 @@ ktime_t hrtimer_get_next_event(void) } } - spin_unlock_irqrestore(&cpu_base->lock, flags); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); if (mindelta.tv64 < 0) mindelta.tv64 = 0; @@ -1222,11 +1222,11 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) * they get migrated to another cpu, therefore its safe to unlock * the timer base. 
*/ - spin_unlock(&cpu_base->lock); + raw_spin_unlock(&cpu_base->lock); trace_hrtimer_expire_entry(timer, now); restart = fn(timer); trace_hrtimer_expire_exit(timer); - spin_lock(&cpu_base->lock); + raw_spin_lock(&cpu_base->lock); /* * Note: We clear the CALLBACK bit after enqueue_hrtimer and @@ -1261,7 +1261,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) retry: expires_next.tv64 = KTIME_MAX; - spin_lock(&cpu_base->lock); + raw_spin_lock(&cpu_base->lock); /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1317,7 +1317,7 @@ retry: * against it. */ cpu_base->expires_next = expires_next; - spin_unlock(&cpu_base->lock); + raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ if (expires_next.tv64 == KTIME_MAX || @@ -1457,7 +1457,7 @@ void hrtimer_run_queues(void) gettime = 0; } - spin_lock(&cpu_base->lock); + raw_spin_lock(&cpu_base->lock); while ((node = base->first)) { struct hrtimer *timer; @@ -1469,7 +1469,7 @@ void hrtimer_run_queues(void) __run_hrtimer(timer, &base->softirq_time); } - spin_unlock(&cpu_base->lock); + raw_spin_unlock(&cpu_base->lock); } } @@ -1625,7 +1625,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - spin_lock_init(&cpu_base->lock); + raw_spin_lock_init(&cpu_base->lock); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) cpu_base->clock_base[i].cpu_base = cpu_base; @@ -1683,16 +1683,16 @@ static void migrate_hrtimers(int scpu) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
*/ - spin_lock(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); + raw_spin_unlock(&old_base->lock); + raw_spin_unlock(&new_base->lock); /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 9d80db4747d4..28265636b6c2 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, next_one: i = 0; - spin_lock_irqsave(&base->cpu_base->lock, flags); + raw_spin_lock_irqsave(&base->cpu_base->lock, flags); curr = base->first; /* @@ -100,13 +100,13 @@ next_one: timer = rb_entry(curr, struct hrtimer, node); tmp = *timer; - spin_unlock_irqrestore(&base->cpu_base->lock, flags); + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); print_timer(m, timer, &tmp, i, now); next++; goto next_one; } - spin_unlock_irqrestore(&base->cpu_base->lock, flags); + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); } static void diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 63b117e9eba1..2f3b585b8d7d 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock); /* * Per-CPU lookup locks for fast hash lookup: */ -static DEFINE_PER_CPU(spinlock_t, tstats_lookup_lock); +static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock); /* * Mutex to serialize state changes with show-stats activities: @@ -238,7 +238,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, /* * It doesnt matter which lock we take: */ - spinlock_t *lock; + raw_spinlock_t *lock; struct entry *entry, input; unsigned 
long flags; @@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.pid = pid; input.timer_flag = timer_flag; - spin_lock_irqsave(lock, flags); + raw_spin_lock_irqsave(lock, flags); if (!timer_stats_active) goto out_unlock; @@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, atomic_inc(&overflow_count); out_unlock: - spin_unlock_irqrestore(lock, flags); + raw_spin_unlock_irqrestore(lock, flags); } static void print_name_offset(struct seq_file *m, unsigned long addr) @@ -348,10 +348,11 @@ static void sync_access(void) int cpu; for_each_online_cpu(cpu) { - spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu); - spin_lock_irqsave(lock, flags); + raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu); + + raw_spin_lock_irqsave(lock, flags); /* nothing */ - spin_unlock_irqrestore(lock, flags); + raw_spin_unlock_irqrestore(lock, flags); } } @@ -409,7 +410,7 @@ void __init init_timer_stats(void) int cpu; for_each_possible_cpu(cpu) - spin_lock_init(&per_cpu(tstats_lookup_lock, cpu)); + raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu)); } static int __init init_tstats_procfs(void) -- cgit v1.2.1 From e625cce1b73fb38b74e5387226534f7bcbfc36fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 18:02:06 +0100 Subject: perf_event: Convert to raw_spinlock Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/hw_breakpoint.c | 4 +- kernel/perf_event.c | 106 ++++++++++++++++++++++++------------------------- 2 files changed, 55 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 366eedf949c0..dbcbf6a33a08 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -96,7 +96,7 @@ static int task_bp_pinned(struct task_struct *tsk) list = &ctx->event_list; - spin_lock_irqsave(&ctx->lock, flags); + raw_spin_lock_irqsave(&ctx->lock, flags); /* * The current breakpoint counter is not included in the list @@ -107,7 +107,7 @@ static int task_bp_pinned(struct task_struct *tsk) count++; } - spin_unlock_irqrestore(&ctx->lock, flags); + raw_spin_unlock_irqrestore(&ctx->lock, flags); return count; } diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e73e53c7582f..9052d6c8c9fd 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -203,14 +203,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * if so. If we locked the right context, then it * can't get swapped on us any more. 
*/ - spin_lock_irqsave(&ctx->lock, *flags); + raw_spin_lock_irqsave(&ctx->lock, *flags); if (ctx != rcu_dereference(task->perf_event_ctxp)) { - spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } if (!atomic_inc_not_zero(&ctx->refcount)) { - spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock_irqrestore(&ctx->lock, *flags); ctx = NULL; } } @@ -231,7 +231,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); + raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } @@ -240,9 +240,9 @@ static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; - spin_lock_irqsave(&ctx->lock, flags); + raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); + raw_spin_unlock_irqrestore(&ctx->lock, flags); put_ctx(ctx); } @@ -427,7 +427,7 @@ static void __perf_event_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the * events on a global level. @@ -449,7 +449,7 @@ static void __perf_event_remove_from_context(void *info) } perf_enable(); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } @@ -488,12 +488,12 @@ retry: task_oncpu_function_call(task, __perf_event_remove_from_context, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * If the context is active we need to retry the smp call. 
*/ if (ctx->nr_active && !list_empty(&event->group_entry)) { - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); goto retry; } @@ -504,7 +504,7 @@ retry: */ if (!list_empty(&event->group_entry)) list_del_event(event, ctx); - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); } /* @@ -535,7 +535,7 @@ static void __perf_event_disable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); /* * If the event is on, turn it off. @@ -551,7 +551,7 @@ static void __perf_event_disable(void *info) event->state = PERF_EVENT_STATE_OFF; } - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -584,12 +584,12 @@ void perf_event_disable(struct perf_event *event) retry: task_oncpu_function_call(task, __perf_event_disable, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * If the event is still active, we need to retry the cross-call. */ if (event->state == PERF_EVENT_STATE_ACTIVE) { - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); goto retry; } @@ -602,7 +602,7 @@ void perf_event_disable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; } - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); } static int @@ -770,7 +770,7 @@ static void __perf_install_in_context(void *info) cpuctx->task_ctx = ctx; } - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); @@ -820,7 +820,7 @@ static void __perf_install_in_context(void *info) unlock: perf_enable(); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -856,12 +856,12 @@ retry: task_oncpu_function_call(task, __perf_install_in_context, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * we need to retry the smp call. 
*/ if (ctx->is_active && list_empty(&event->group_entry)) { - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); goto retry; } @@ -872,7 +872,7 @@ retry: */ if (list_empty(&event->group_entry)) add_event_to_ctx(event, ctx); - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); } /* @@ -917,7 +917,7 @@ static void __perf_event_enable(void *info) cpuctx->task_ctx = ctx; } - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); @@ -959,7 +959,7 @@ static void __perf_event_enable(void *info) } unlock: - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -985,7 +985,7 @@ void perf_event_enable(struct perf_event *event) return; } - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE) goto out; @@ -1000,10 +1000,10 @@ void perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; retry: - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); task_oncpu_function_call(task, __perf_event_enable, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * If the context is active and the event is still off, @@ -1020,7 +1020,7 @@ void perf_event_enable(struct perf_event *event) __perf_event_mark_enabled(event, ctx); out: - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); } static int perf_event_refresh(struct perf_event *event, int refresh) @@ -1042,7 +1042,7 @@ void __perf_event_sched_out(struct perf_event_context *ctx, { struct perf_event *event; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 0; if (likely(!ctx->nr_events)) goto out; @@ -1055,7 +1055,7 @@ void __perf_event_sched_out(struct perf_event_context *ctx, } perf_enable(); out: - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -1193,8 +1193,8 @@ void perf_event_task_sched_out(struct task_struct *task, * order we take the locks because no other cpu could * be trying to lock both of 
these tasks. */ - spin_lock(&ctx->lock); - spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&ctx->lock); + raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { /* * XXX do we need a memory barrier of sorts @@ -1208,8 +1208,8 @@ void perf_event_task_sched_out(struct task_struct *task, perf_event_sync_stat(ctx, next_ctx); } - spin_unlock(&next_ctx->lock); - spin_unlock(&ctx->lock); + raw_spin_unlock(&next_ctx->lock); + raw_spin_unlock(&ctx->lock); } rcu_read_unlock(); @@ -1251,7 +1251,7 @@ __perf_event_sched_in(struct perf_event_context *ctx, struct perf_event *event; int can_add_hw = 1; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) goto out; @@ -1306,7 +1306,7 @@ __perf_event_sched_in(struct perf_event_context *ctx, } perf_enable(); out: - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -1370,7 +1370,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) struct hw_perf_event *hwc; u64 interrupts, freq; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -1425,7 +1425,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) perf_enable(); } } - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -1438,7 +1438,7 @@ static void rotate_ctx(struct perf_event_context *ctx) if (!ctx->nr_events) return; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); /* * Rotate the first entry last (works just fine for group events too): */ @@ -1449,7 +1449,7 @@ static void rotate_ctx(struct perf_event_context *ctx) } perf_enable(); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } void perf_event_task_tick(struct task_struct *curr, int cpu) @@ -1498,7 +1498,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) __perf_event_task_sched_out(ctx); - 
spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); list_for_each_entry(event, &ctx->group_list, group_entry) { if (!event->attr.enable_on_exec) @@ -1516,7 +1516,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) if (enabled) unclone_ctx(ctx); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); perf_event_task_sched_in(task, smp_processor_id()); out: @@ -1542,10 +1542,10 @@ static void __perf_event_read(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); update_context_time(ctx); update_event_times(event); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); event->pmu->read(event); } @@ -1563,10 +1563,10 @@ static u64 perf_event_read(struct perf_event *event) struct perf_event_context *ctx = event->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->lock, flags); + raw_spin_lock_irqsave(&ctx->lock, flags); update_context_time(ctx); update_event_times(event); - spin_unlock_irqrestore(&ctx->lock, flags); + raw_spin_unlock_irqrestore(&ctx->lock, flags); } return atomic64_read(&event->count); @@ -1579,7 +1579,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx, struct task_struct *task) { - spin_lock_init(&ctx->lock); + raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->group_list); INIT_LIST_HEAD(&ctx->event_list); @@ -1649,7 +1649,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) ctx = perf_lock_task_context(task, &flags); if (ctx) { unclone_ctx(ctx); - spin_unlock_irqrestore(&ctx->lock, flags); + raw_spin_unlock_irqrestore(&ctx->lock, flags); } if (!ctx) { @@ -1987,7 +1987,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (!value) return -EINVAL; - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); if (event->attr.freq) { if (value > sysctl_perf_event_sample_rate) { ret = -EINVAL; @@ -2000,7 +2000,7 @@ static int perf_event_period(struct perf_event *event, u64 
__user *arg) event->hw.sample_period = value; } unlock: - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); return ret; } @@ -4992,7 +4992,7 @@ void perf_event_exit_task(struct task_struct *child) * reading child->perf_event_ctxp, we wait until it has * incremented the context's refcount before we do put_ctx below. */ - spin_lock(&child_ctx->lock); + raw_spin_lock(&child_ctx->lock); child->perf_event_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get @@ -5001,7 +5001,7 @@ void perf_event_exit_task(struct task_struct *child) */ unclone_ctx(child_ctx); update_context_time(child_ctx); - spin_unlock_irqrestore(&child_ctx->lock, flags); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* * Report the task dead after unscheduling the events so that we @@ -5292,11 +5292,11 @@ perf_set_reserve_percpu(struct sysdev_class *class, perf_reserved_percpu = val; for_each_online_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_irq(&cpuctx->ctx.lock); + raw_spin_lock_irq(&cpuctx->ctx.lock); mpt = min(perf_max_events - cpuctx->ctx.nr_events, perf_max_events - perf_reserved_percpu); cpuctx->max_pertask = mpt; - spin_unlock_irq(&cpuctx->ctx.lock); + raw_spin_unlock_irq(&cpuctx->ctx.lock); } spin_unlock(&perf_resource_lock); -- cgit v1.2.1 From d192c47f25daa98996c7eae543d8a27e41247ec2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Dec 2009 12:49:26 +0100 Subject: clockevents: Make tick_device_lock static Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/time/tick-common.c | 2 +- kernel/time/tick-internal.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 83c4417b6a3c..af39cf1cfa50 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); ktime_t tick_next_period; ktime_t tick_period; int 
tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -DEFINE_SPINLOCK(tick_device_lock); +static DEFINE_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index b1c05bf75ee0..290eefbc1f60 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,7 +6,6 @@ #define TICK_DO_TIMER_BOOT -2 DECLARE_PER_CPU(struct tick_device, tick_cpu_device); -extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; extern ktime_t tick_period; extern int tick_do_timer_cpu __read_mostly; -- cgit v1.2.1 From b5f91da0a6973bb6f9ff3b91b0e92c0773a458f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Dec 2009 12:40:31 +0100 Subject: clockevents: Convert to raw_spinlock Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- kernel/time/clockevents.c | 14 +++++++------- kernel/time/tick-broadcast.c | 42 +++++++++++++++++++++--------------------- kernel/time/tick-common.c | 20 ++++++++++---------- 3 files changed, 38 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 20a8920029ee..3d5fc0fd1cca 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -30,7 +30,7 @@ static LIST_HEAD(clockevents_released); static RAW_NOTIFIER_HEAD(clockevents_chain); /* Protection for the above */ -static DEFINE_SPINLOCK(clockevents_lock); +static DEFINE_RAW_SPINLOCK(clockevents_lock); /** * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds @@ -141,9 +141,9 @@ int clockevents_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock_irqsave(&clockevents_lock, flags); ret = raw_notifier_chain_register(&clockevents_chain, nb); - spin_unlock_irqrestore(&clockevents_lock, flags); + 
raw_spin_unlock_irqrestore(&clockevents_lock, flags); return ret; } @@ -185,13 +185,13 @@ void clockevents_register_device(struct clock_event_device *dev) BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); BUG_ON(!dev->cpumask); - spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); clockevents_notify_released(); - spin_unlock_irqrestore(&clockevents_lock, flags); + raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); @@ -241,7 +241,7 @@ void clockevents_notify(unsigned long reason, void *arg) struct list_head *node, *tmp; unsigned long flags; - spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock_irqsave(&clockevents_lock, flags); clockevents_do_notify(reason, arg); switch (reason) { @@ -256,7 +256,7 @@ void clockevents_notify(unsigned long reason, void *arg) default: break; } - spin_unlock_irqrestore(&clockevents_lock, flags); + raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); #endif diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c2ec25087a35..b3bafd5fc66d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device; /* FIXME: Use cpumask_var_t. 
*/ static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); static DECLARE_BITMAP(tmpmask, NR_CPUS); -static DEFINE_SPINLOCK(tick_broadcast_lock); +static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; #ifdef CONFIG_TICK_ONESHOT @@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) unsigned long flags; int ret = 0; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Devices might be registered with both periodic and oneshot @@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) tick_broadcast_clear_oneshot(cpu); } } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } @@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask) */ static void tick_do_periodic_broadcast(void) { - spin_lock(&tick_broadcast_lock); + raw_spin_lock(&tick_broadcast_lock); cpumask_and(to_cpumask(tmpmask), cpu_online_mask, tick_get_broadcast_mask()); tick_do_broadcast(to_cpumask(tmpmask)); - spin_unlock(&tick_broadcast_lock); + raw_spin_unlock(&tick_broadcast_lock); } /* @@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) unsigned long flags; int cpu, bc_stopped; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); cpu = smp_processor_id(); td = &per_cpu(tick_cpu_device, cpu); @@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) tick_broadcast_setup_oneshot(bc); } out: - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* @@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) unsigned long flags; unsigned int cpu = *cpup; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; 
cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); @@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) clockevents_shutdown(bc); } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } void tick_suspend_broadcast(void) @@ -317,13 +317,13 @@ void tick_suspend_broadcast(void) struct clock_event_device *bc; unsigned long flags; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc) clockevents_shutdown(bc); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } int tick_resume_broadcast(void) @@ -332,7 +332,7 @@ int tick_resume_broadcast(void) unsigned long flags; int broadcast = 0; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; @@ -351,7 +351,7 @@ int tick_resume_broadcast(void) break; } } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return broadcast; } @@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) ktime_t now, next_event; int cpu; - spin_lock(&tick_broadcast_lock); + raw_spin_lock(&tick_broadcast_lock); again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; @@ -443,7 +443,7 @@ again: if (tick_broadcast_set_event(next_event, 0)) goto again; } - spin_unlock(&tick_broadcast_lock); + raw_spin_unlock(&tick_broadcast_lock); } /* @@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) unsigned long flags; int cpu; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Periodic mode does not care about the enter/exit of power @@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) } out: - 
spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* @@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void) struct clock_event_device *bc; unsigned long flags; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) unsigned long flags; unsigned int cpu = *cpup; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Clear the broadcast mask flag for the dead cpu, but do not @@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) */ cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index af39cf1cfa50..b6b898d2eeef 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -static DEFINE_SPINLOCK(tick_device_lock); +static DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) int cpu, ret = NOTIFY_OK; unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) if 
(newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); return NOTIFY_STOP; out_bc: @@ -278,7 +278,7 @@ out_bc: if (tick_check_broadcast_device(newdev)) ret = NOTIFY_STOP; - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); return ret; } @@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup) struct clock_event_device *dev = td->evtdev; unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* @@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_suspend(void) @@ -330,9 +330,9 @@ static void tick_suspend(void) struct tick_device *td = &__get_cpu_var(tick_cpu_device); unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_shutdown(td->evtdev); - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_resume(void) @@ -341,7 +341,7 @@ static void tick_resume(void) unsigned long flags; int broadcast = tick_resume_broadcast(); - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); if (!broadcast) { @@ -350,7 +350,7 @@ static void tick_resume(void) else tick_resume_oneshot(); } - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); } /* -- cgit v1.2.1 From d4703aefdbc8f9f347f6dcefcddd791294314eb7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 15 Dec 2009 16:28:32 -0600 Subject: module: 
handle ppc64 relocating kcrctabs when CONFIG_RELOCATABLE=y powerpc applies relocations to the kcrctab. They're absolute symbols, but it's not completely unreasonable: other archs may too, but the relocation is often 0. http://lists.ozlabs.org/pipermail/linuxppc-dev/2009-November/077972.html Inspired-by: Neil Horman Signed-off-by: Rusty Russell Tested-by: Neil Horman Acked-by: Paul Mackerras --- kernel/module.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 12afc5a3ddd3..a65dc787a27b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -880,11 +880,23 @@ static int try_to_force_load(struct module *mod, const char *reason) } #ifdef CONFIG_MODVERSIONS +/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */ +static unsigned long maybe_relocated(unsigned long crc, + const struct module *crc_owner) +{ +#ifdef ARCH_RELOCATES_KCRCTAB + if (crc_owner == NULL) + return crc - (unsigned long)reloc_start; +#endif + return crc; +} + static int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, struct module *mod, - const unsigned long *crc) + const unsigned long *crc, + const struct module *crc_owner) { unsigned int i, num_versions; struct modversion_info *versions; @@ -905,10 +917,10 @@ static int check_version(Elf_Shdr *sechdrs, if (strcmp(versions[i].name, symname) != 0) continue; - if (versions[i].crc == *crc) + if (versions[i].crc == maybe_relocated(*crc, crc_owner)) return 1; DEBUGP("Found checksum %lX vs module %lX\n", - *crc, versions[i].crc); + maybe_relocated(*crc, crc_owner), versions[i].crc); goto bad_version; } @@ -931,7 +943,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, &crc, true, false)) BUG(); - return check_version(sechdrs, versindex, "module_layout", mod, crc); + return check_version(sechdrs, versindex, "module_layout", 
mod, crc, + NULL); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -949,7 +962,8 @@ static inline int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, struct module *mod, - const unsigned long *crc) + const unsigned long *crc, + const struct module *crc_owner) { return 1; } @@ -984,8 +998,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, /* use_module can fail due to OOM, or module initialization or unloading */ if (sym) { - if (!check_version(sechdrs, versindex, name, mod, crc) || - !use_module(mod, owner)) + if (!check_version(sechdrs, versindex, name, mod, crc, owner) + || !use_module(mod, owner)) sym = NULL; } return sym; -- cgit v1.2.1 From e36c54582c6f14adc9e10473e2aec2cc4f0acc03 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 14 Dec 2009 15:58:33 -0500 Subject: tracing: Fix return of trace_dump_stack() The trace_dump_stack() returned a value for a void function. Also, added the missing stub for trace_dump_stack() when tracing is not configured. Reported-by: Ingo Molnar LKML-Reference: <20091214162713.GA31060@elte.hu> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bd7b969a729a..ee61915935d5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1158,7 +1158,7 @@ void trace_dump_stack(void) unsigned long flags; if (tracing_disabled || tracing_selftest_running) - return 0; + return; local_save_flags(flags); -- cgit v1.2.1 From b9f8fcd55bbdb037e5332dbdb7b494f0b70861ac Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 13 Dec 2009 18:25:02 -0800 Subject: sched: Fix cpu_clock() in NMIs, on !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK Relax stable-sched-clock architectures to not save/disable/restore hardirqs in cpu_clock(). 
The background is that I was trying to resolve a sparc64 perf issue when I discovered this problem. On sparc64 I implement pseudo NMIs by simply running the kernel at IRQ level 14 when local_irq_disable() is called, this allows performance counter events to still come in at IRQ level 15. This doesn't work if any code in an NMI handler does local_irq_save() or local_irq_disable() since the "disable" will kick us back to cpu IRQ level 14 thus letting NMIs back in and we recurse. The only path that does that in the perf event IRQ handling path is the code supporting frequency based events. It uses cpu_clock(). cpu_clock() simply invokes sched_clock() with IRQs disabled. And that's a fundamental bug all on its own, particularly for the HAVE_UNSTABLE_SCHED_CLOCK case. NMIs can thus get into the sched_clock() code interrupting the local IRQ disable code sections of it. Furthermore, for the not-HAVE_UNSTABLE_SCHED_CLOCK case, the IRQ disabling done by cpu_clock() is just pure overhead and completely unnecessary. So the core problem is that sched_clock() is not NMI safe, but we are invoking it from NMI contexts in the perf events code (via cpu_clock()). A less important issue is the overhead of IRQ disabling when it isn't necessary in cpu_clock(). CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures are not affected by this patch. Signed-off-by: David S.
Miller Acked-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091213.182502.215092085.davem@davemloft.net> Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 479ce5682d7c..5b496132c28a 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +unsigned long long cpu_clock(int cpu) +{ + unsigned long long clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(cpu); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ unsigned long long cpu_clock(int cpu) { - unsigned long long clock; - unsigned long flags; + return sched_clock_cpu(cpu); +} - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - return clock; -} EXPORT_SYMBOL_GPL(cpu_clock); -- cgit v1.2.1 From 0f624e7e5625f4c30c836b7a5decfe2553582391 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 15 Dec 2009 19:40:32 +1100 Subject: perf_event: Fix incorrect range check on cpu number It is quite legitimate for CPUs to be numbered sparsely, meaning that it is possible for an online CPU to have a number which is greater than the total count of possible CPUs. Currently find_get_context() has a sanity check on the cpu number where it checks it against num_possible_cpus(). This test can fail for a legitimate cpu number if the cpu_possible_mask is sparsely populated.
This fixes the problem by checking the CPU number against nr_cpumask_bits instead, since that is the appropriate check to ensure that the cpu number is same to pass to cpu_isset() subsequently. Reported-by: Michael Neuling Signed-off-by: Paul Mackerras Tested-by: Michael Neuling Acked-by: Peter Zijlstra Cc: LKML-Reference: <20091215084032.GA18661@brick.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index d891ec4a8100..8823b0885183 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1603,7 +1603,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); - if (cpu < 0 || cpu > num_possible_cpus()) + if (cpu < 0 || cpu >= nr_cpumask_bits) return ERR_PTR(-EINVAL); /* -- cgit v1.2.1 From 4b731d50ff3df6b9141a6c12b088e8eb0109e83c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 14 Dec 2009 17:57:34 -0800 Subject: bsdacct: fix uid/gid misreporting commit d8e180dcd5bbbab9cd3ff2e779efcf70692ef541 "bsdacct: switch credentials for writing to the accounting file" introduced credential switching during final acct data collecting. However, uid/gid pair continued to be collected from current which became credentials of who created acct file, not who exits. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=14676 Signed-off-by: Alexey Dobriyan Reported-by: Juho K. 
Juopperi Acked-by: Serge Hallyn Acked-by: David Howells Reviewed-by: Michal Schmidt Cc: James Morris Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 9a4715a2f6bf..a6605ca921b6 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -536,7 +536,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, do_div(elapsed, AHZ); ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ - current_uid_gid(&ac.ac_uid, &ac.ac_gid); + ac.ac_uid = orig_cred->uid; + ac.ac_gid = orig_cred->gid; #if ACCT_VERSION==2 ac.ac_ahz = AHZ; #endif -- cgit v1.2.1 From 06808b0827e1cd14eedc96bac2655d5b37ac246c Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 14 Dec 2009 17:58:21 -0800 Subject: hugetlb: derive huge pages nodes allowed from task mempolicy This patch derives a "nodes_allowed" node mask from the numa mempolicy of the task modifying the number of persistent huge pages to control the allocation, freeing and adjusting of surplus huge pages when the pool page count is modified via the new sysctl or sysfs attribute "nr_hugepages_mempolicy". The nodes_allowed mask is derived as follows: * For "default" [NULL] task mempolicy, a NULL nodemask_t pointer is produced. This will cause the hugetlb subsystem to use node_online_map as the "nodes_allowed". This preserves the behavior before this patch. * For "preferred" mempolicy, including explicit local allocation, a nodemask with the single preferred node will be produced. "local" policy will NOT track any internode migrations of the task adjusting nr_hugepages. * For "bind" and "interleave" policy, the mempolicy's nodemask will be used. * Other than to inform the construction of the nodes_allowed node mask, the actual mempolicy mode is ignored. 
That is, all modes behave like interleave over the resulting nodes_allowed mask with no "fallback". See the updated documentation [next patch] for more information about the implications of this patch. Examples: Starting with: Node 0 HugePages_Total: 0 Node 1 HugePages_Total: 0 Node 2 HugePages_Total: 0 Node 3 HugePages_Total: 0 Default behavior [with or without this patch] balances persistent hugepage allocation across nodes [with sufficient contiguous memory]: sysctl vm.nr_hugepages[_mempolicy]=32 yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 8 Node 3 HugePages_Total: 8 Of course, we only have nr_hugepages_mempolicy with the patch, but with default mempolicy, nr_hugepages_mempolicy behaves the same as nr_hugepages. Applying mempolicy--e.g., with numactl [using '-m' a.k.a. '--membind' because it allows multiple nodes to be specified and it's easy to type]--we can allocate huge pages on individual nodes or sets of nodes. So, starting from the condition above, with 8 huge pages per node, add 8 more to node 2 using: numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40 This yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The incremental 8 huge pages were restricted to node 2 by the specified mempolicy. Similarly, we can use mempolicy to free persistent huge pages from specified nodes: numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32 yields: Node 0 HugePages_Total: 4 Node 1 HugePages_Total: 4 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The 8 huge pages freed were balanced over nodes 0 and 1. 
[rientjes@google.com: accomodate reworked NODEMASK_ALLOC] Signed-off-by: David Rientjes Signed-off-by: Lee Schermerhorn Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: KAMEZAWA Hiroyuki Cc: Randy Dunlap Cc: Nishanth Aravamudan Cc: Adam Litke Cc: Andy Whitcroft Cc: Eric Whitney Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 554ac4894f0f..60fc93131095 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1051,7 +1051,7 @@ static struct ctl_table vm_table[] = { .extra2 = &one_hundred, }, #ifdef CONFIG_HUGETLB_PAGE - { + { .procname = "nr_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), @@ -1059,7 +1059,18 @@ static struct ctl_table vm_table[] = { .proc_handler = hugetlb_sysctl_handler, .extra1 = (void *)&hugetlb_zero, .extra2 = (void *)&hugetlb_infinity, - }, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, +#endif { .procname = "hugetlb_shm_group", .data = &sysctl_hugetlb_shm_group, -- cgit v1.2.1 From 70da2340fbc68e91e701762f785479ab495a0869 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 14 Dec 2009 17:59:52 -0800 Subject: 'sysctl_max_map_count' should be non-negative Jan Engelhardt reported we have this problem: setting max_map_count to a value large enough results in programs dying at first try. This is on 2.6.31.6: 15:59 borg:/proc/sys/vm # echo $[1<<31-1] >max_map_count 15:59 borg:/proc/sys/vm # cat max_map_count 1073741824 15:59 borg:/proc/sys/vm # echo $[1<<31] >max_map_count 15:59 borg:/proc/sys/vm # cat max_map_count Killed This is because we have a chance to make 'max_map_count' negative. but it's meaningless. 
Make it only accept non-negative values. Reported-by: Jan Engelhardt Signed-off-by: WANG Cong Cc: Ingo Molnar Cc: Peter Zijlstra Cc: James Morris Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 60fc93131095..45e4bef0012a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1131,7 +1131,8 @@ static struct ctl_table vm_table[] = { .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec, + .extra1 = &zero, }, #else { -- cgit v1.2.1 From c0f68c2fab4898bcc4671a8fb941f428856b4ad5 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 14 Dec 2009 18:00:16 -0800 Subject: generic-ipi: cleanup for generic_smp_call_function_interrupt() Use smp_processor_id() instead of get_cpu() and put_cpu() in generic_smp_call_function_interrupt(), It's no need to disable preempt, because we must call generic_smp_call_function_interrupt() with interrupts disabled. Signed-off-by: Xiao Guangrong Acked-by: Ingo Molnar Cc: Jens Axboe Cc: Nick Piggin Cc: Peter Zijlstra Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index a8c76069cf50..00a1d0ede532 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -171,7 +171,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) void generic_smp_call_function_interrupt(void) { struct call_function_data *data; - int cpu = get_cpu(); + int cpu = smp_processor_id(); /* * Shouldn't receive this interrupt on a cpu that is not yet online. 
@@ -212,7 +212,6 @@ void generic_smp_call_function_interrupt(void) csd_unlock(&data->csd); } - put_cpu(); } /* -- cgit v1.2.1 From dfc6a736d452a8c308190b618b065c2257d370ff Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Mon, 14 Dec 2009 18:00:22 -0800 Subject: kernel/sys.c: fix "warning: do-while statement is not a compound statement" noise do_each_thread/while_each_thread wrap a block of code that is in this format: for (...) do ... while If curly braces do not surround the inner loop the following warning is generated by sparse: warning: do-while statement is not a compound statement Fix the warning by adding the braces. Signed-off-by: H Hartley Sweeten Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 585d6cd10040..20ccfb5da6af 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -189,10 +189,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread(g, p) { if (__task_cred(p)->uid == who) error = set_one_prio(p, niceval, error); - while_each_thread(g, p); + } while_each_thread(g, p); if (who != cred->uid) free_uid(user); /* For find_user() */ break; @@ -252,13 +252,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread(g, p) { if (__task_cred(p)->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } - while_each_thread(g, p); + } while_each_thread(g, p); if (who != cred->uid) free_uid(user); /* for find_user() */ break; -- cgit v1.2.1 From 5ada918b82399eef3afd6a71e3637697d6bd719f Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Mon, 14 Dec 2009 18:00:43 -0800 Subject: vt: introduce and use vt_kmsg_redirect() function The kernel 
offers with TIOCL_GETKMSGREDIRECT ioctl() the possibility to redirect the kernel messages to a specific console. However, since it's not possible to switch to the kernel message console after a panic(), it would be nice if the kernel would print the panic message on the current console. This patch series adds a new interface to access the global kmsg_redirect variable by a function to be able to use it in code where CONFIG_VT_CONSOLE is not set (kernel/panic.c). This patch: Instead of using and exporting a global value kmsg_redirect, introduce a function vt_kmsg_redirect() that both can set and return the console where messages are printed. Change all users of kmsg_redirect (the VT code itself and kernel/power.c) to the new interface. The main advantage is that vt_kmsg_redirect() can also be used when CONFIG_VT_CONSOLE is not set. Signed-off-by: Bernhard Walle Cc: Alan Cox Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/console.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/console.c b/kernel/power/console.c index 5187136fe1de..218e5af90156 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include "power.h" @@ -21,8 +21,7 @@ int pm_prepare_console(void) if (orig_fgconsole < 0) return 1; - orig_kmsg = kmsg_redirect; - kmsg_redirect = SUSPEND_CONSOLE; + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); return 0; } @@ -30,7 +29,7 @@ void pm_restore_console(void) { if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); - kmsg_redirect = orig_kmsg; + vt_kmsg_redirect(orig_kmsg); } } #endif -- cgit v1.2.1 From e7d2860b690d4f3bed6824757c540579638e3d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= Date: Mon, 14 Dec 2009 18:01:06 -0800 Subject: tree-wide: convert open calls to remove spaces to skip_spaces() lib function MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes use of skip_spaces() defined in lib/string.c for removing leading spaces from strings all over the tree. It decreases lib.a code size by 47 bytes and reuses the function tree-wide: text data bss dec hex filename 64688 584 592 65864 10148 (TOTALS-BEFORE) 64641 584 592 65817 10119 (TOTALS-AFTER) Also, while at it, if we see (*str && isspace(*str)), we can be sure to remove the first condition (*str) as the second one (isspace(*str)) also evaluates to 0 whenever *str == 0, making it redundant. In other words, "a char equals zero is never a space". Julia Lawall tried the semantic patch (http://coccinelle.lip6.fr) below, and found occurrences of this pattern on 3 more files: drivers/leds/led-class.c drivers/leds/ledtrig-timer.c drivers/video/output.c @@ expression str; @@ ( // ignore skip_spaces cases while (*str && isspace(*str)) { \(str++;\|++str;\) } | - *str && isspace(*str) ) Signed-off-by: André Goddard Rosa Cc: Julia Lawall Cc: Martin Schwidefsky Cc: Jeff Dike Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Richard Purdie Cc: Neil Brown Cc: Kyle McMartin Cc: Henrique de Moraes Holschuh Cc: David Howells Cc: Cc: Samuel Ortiz Cc: Patrick McHardy Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index d656c276508d..cf1b69183127 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,6 +24,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -122,9 +123,7 @@ static char *next_arg(char *args, char **param, char **val) next = args + i; /* Chew up trailing spaces. */ - while (isspace(*next)) - next++; - return next; + return skip_spaces(next); } /* Args looks like "foo=bar,bar2 baz=fuz wiz". 
*/ @@ -139,8 +138,7 @@ int parse_args(const char *name, DEBUGP("Parsing ARGS: %s\n", args); /* Chew leading spaces */ - while (isspace(*args)) - args++; + args = skip_spaces(args); while (*args) { int ret; -- cgit v1.2.1 From f13c12c634e124d5d31f912b969d542a016d6105 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Dec 2009 19:43:11 +0100 Subject: perf_events: Fix perf_event_attr layout The miss-alignment of bp_addr created a 32bit hole, causing different structure packings on 32 and 64 bit machines. Fix that by moving __reserve_2 into that hole. Further, remove the useless struct and redundant __bp_reserve muck. Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <1260902591.8023.781.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8823b0885183..0dd8e5d02c66 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4564,7 +4564,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->type >= PERF_TYPE_MAX) return -EINVAL; - if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + if (attr->__reserved_1 || attr->__reserved_2) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) -- cgit v1.2.1 From f065f41f48569122b5bcddbd1ba2354f7cc29fdc Mon Sep 17 00:00:00 2001 From: Barry Song <21cnbao@gmail.com> Date: Tue, 15 Dec 2009 16:45:34 -0800 Subject: timecompare: fix half-Y2K38 problem in timecompare_update while calculating offset ktime will overflow from 03:14:07 UTC on Tuesday, 19 January 2038, ktime_add() in timecompare_update() will overflow a half earlier. As a result, wrong offset will be gotten, then cause some strange problems. Signed-off-by: Barry Song <21cnbao@gmail.com> Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Patrick Ohly Cc: David S. 
Miller Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/timecompare.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index 96ff643a5a59..12f5c55090be 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c @@ -89,7 +89,7 @@ int timecompare_offset(struct timecompare *sync, * source time */ sample.offset = - ktime_to_ns(ktime_add(end, start)) / 2 - + (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - ts; /* simple insertion sort based on duration */ -- cgit v1.2.1 From 28dfef8febe48f59cf1e7596e1992a6a1893ca24 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 15 Dec 2009 16:46:48 -0800 Subject: const: constify remaining pipe_buf_operations Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 2 +- kernel/trace/trace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 760c26209a3c..c705a41b4ba3 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1198,7 +1198,7 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, relay_consume_bytes(rbuf, buf->private); } -static struct pipe_buf_operations relay_pipe_buf_ops = { +static const struct pipe_buf_operations relay_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb6b5e7fa2a2..31118ae16f03 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3133,7 +3133,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, __free_page(spd->pages[idx]); } -static struct pipe_buf_operations tracing_pipe_buf_ops = { +static const struct pipe_buf_operations tracing_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -3617,7 +3617,7 @@ static void 
buffer_pipe_buf_get(struct pipe_inode_info *pipe, } /* Pipe buffer operations for a buffer. */ -static struct pipe_buf_operations buffer_pipe_buf_ops = { +static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, -- cgit v1.2.1 From 569b846df54ffb2827b83ce3244c5f032394cba4 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 15 Dec 2009 16:47:03 -0800 Subject: memcg: coalesce uncharge during unmap/truncate In massive parallel environment, res_counter can be a performance bottleneck. One strong technique to reduce lock contention is reducing calls by coalescing some amount of calls into one. Considering charge/uncharge characteristics, - charge is done one by one via demand-paging. - uncharge is done by - in chunk at munmap, truncate, exit, execve... - one by one via vmscan/paging. It seems we have a chance to coalesce uncharges for improving scalability at unmap/truncation. This patch is for coalescing uncharge. For avoiding scattering memcg's structure to functions under /mm, this patch adds memcg batch uncharge information to the task. A reason for per-task batching is for making use of caller's context information. We do batched uncharge (delayed uncharge) when truncation/unmap occurs but do direct uncharge when uncharge is called by memory reclaim (vmscan.c). The degree of coalescing depends on callers - at invalidate/truncate... pagevec size - at unmap ....ZAP_BLOCK_SIZE (memory itself will be freed in this degree.) Then, we'll not be coalescing too much. On x86-64 8cpu server, I tested overheads of memcg at page fault by running a program which does map/fault/unmap in a loop. Running a task per a cpu by taskset and see sum of the number of page faults in 60secs. 
[without memcg config] 40156968 page-faults # 0.085 M/sec ( +- 0.046% ) 27.67 cache-miss/faults [root cgroup] 36659599 page-faults # 0.077 M/sec ( +- 0.247% ) 31.58 miss/faults [in a child cgroup] 18444157 page-faults # 0.039 M/sec ( +- 0.133% ) 69.96 miss/faults [child with this patch] 27133719 page-faults # 0.057 M/sec ( +- 0.155% ) 47.16 miss/faults We can see some amounts of improvement. (root cgroup isn't affected by this patch) Another patch for "charge" will follow this and above will be improved more. Changelog(since 2009/10/02): - renamed fields of memcg_batch (as pages to bytes, memsw to memsw_bytes) - some clean up and commentary/description updates. - added initialize code to copy_process(). (possible bug fix) Changelog(old): - fixed !CONFIG_MEM_CGROUP case. - rebased onto the latest mmotm + softlimit fix patches. - unified patch for callers - added comments. - make ->do_batch as bool. - removed css_get() et al. We don't need it. Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9bd91447e052..b6cbd33dde80 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1127,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif +#ifdef CONFIG_CGROUP_MEM_RES_CTLR + p->memcg_batch.do_batch = 0; + p->memcg_batch.memcg = NULL; +#endif p->bts = NULL; -- cgit v1.2.1 From 6580807da14c423f0d0a708108e6df6ebc8bc83d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Dec 2009 16:47:16 -0800 Subject: ptrace: copy_process() should disable stepping If the tracee calls fork() after PTRACE_SINGLESTEP, the forked child starts with TIF_SINGLESTEP/X86_EFLAGS_TF bits copied from ptraced parent. 
This is not right, especially when the new child is not auto-attached: in this case it is killed by SIGTRAP. Change copy_process() to call user_disable_single_step(). Tested on x86. Test-case: #include #include #include #include #include #include int main(void) { int pid, status; if (!(pid = fork())) { assert(ptrace(PTRACE_TRACEME) == 0); kill(getpid(), SIGSTOP); if (!fork()) { /* kernel bug: this child will be killed by SIGTRAP */ printf("Hello world\n"); return 43; } wait(&status); return WEXITSTATUS(status); } for (;;) { assert(pid == wait(&status)); if (WIFEXITED(status)) break; assert(ptrace(PTRACE_SINGLESTEP, pid, 0,0) == 0); } assert(WEXITSTATUS(status) == 43); return 0; } Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b6cbd33dde80..202a0ba63d3c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1210,9 +1210,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->sas_ss_sp = p->sas_ss_size = 0; /* - * Syscall tracing should be turned off in the child regardless - * of CLONE_PTRACE. + * Syscall tracing and stepping should be turned off in the + * child regardless of CLONE_PTRACE. */ + user_disable_single_step(p); clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); -- cgit v1.2.1 From 614c517d7c00af1b26ded20646b329397d6f51a1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Dec 2009 16:47:22 -0800 Subject: signals: SEND_SIG_NOINFO should be considered as SI_FROMUSER() No changes in compiled code. The patch adds the new helper, si_fromuser() and changes check_kill_permission() to use this helper. The real effect of this patch is that from now on we "officially" consider SEND_SIG_NOINFO signal as "from user-space" signals. 
This is already true if we look at the code which uses SEND_SIG_NOINFO, except __send_signal() has another opinion - see the next patch. The naming of these special SEND_SIG_XXX siginfo's is really bad imho. From __send_signal()'s pov they mean SEND_SIG_NOINFO from user SEND_SIG_PRIV from kernel SEND_SIG_FORCED no info Signed-off-by: Oleg Nesterov Cc: Roland McGrath Reviewed-by: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6b982f2cf524..a0ba428954b6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -607,6 +607,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s) return 1; } +static inline int is_si_special(const struct siginfo *info) +{ + return info <= SEND_SIG_FORCED; +} + +static inline bool si_fromuser(const struct siginfo *info) +{ + return info == SEND_SIG_NOINFO || + (!is_si_special(info) && SI_FROMUSER(info)); +} + /* * Bad permissions for sending the signal * - the caller must hold at least the RCU read lock @@ -621,7 +632,7 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return -EINVAL; - if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) + if (!si_fromuser(info)) return 0; error = audit_signal_info(sig, t); /* Let audit system see the signal */ @@ -1186,8 +1197,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, goto out_unlock; } pcred = __task_cred(p); - if ((info == SEND_SIG_NOINFO || - (!is_si_special(info) && SI_FROMUSER(info))) && + if (si_fromuser(info) && euid != pcred->suid && euid != pcred->uid && uid != pcred->suid && uid != pcred->uid) { ret = -EPERM; -- cgit v1.2.1 From dd34200adc01c5217ef09b55905b5c2312d65535 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Dec 2009 16:47:24 -0800 Subject: signals: send_signal: use 
si_fromuser() to detect from_ancestor_ns Change send_signal() to use si_fromuser(). From now on SEND_SIG_NOINFO triggers the "from_ancestor_ns" check. This fixes reparent_thread()->group_send_sig_info(pdeath_signal) behaviour; before this patch send_signal() does not detect the cross-namespace case when the child of the dying parent belongs to the sub-namespace. This patch can affect the behaviour of send_sig(), kill_pgrp() and kill_pid() when the caller sends the signal to the sub-namespace with "priv == 0" but surprisingly all callers seem to use them correctly, including disassociate_ctty(on_exit). Except: drivers/staging/comedi/drivers/addi-data/*.c incorrectly use send_sig(priv => 0). But this is minor and should be fixed anyway. Reported-by: Daniel Lezcano Signed-off-by: Oleg Nesterov Cc: Roland McGrath Reviewed-by: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a0ba428954b6..b65879d4e08f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -960,9 +960,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, int from_ancestor_ns = 0; #ifdef CONFIG_PID_NS - if (!is_si_special(info) && SI_FROMUSER(info) && - task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) - from_ancestor_ns = 1; + from_ancestor_ns = si_fromuser(info) && + !task_pid_nr_ns(current, task_active_pid_ns(t)); #endif return __send_signal(sig, info, t, group, from_ancestor_ns); -- cgit v1.2.1 From 7486e5d9fc773cb67c29381567bed5236fc9573c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Dec 2009 16:47:24 -0800 Subject: signals: cosmetic, collect_signal: use SI_USER Trivial, s/0/SI_USER/ in collect_signal() for grep. This is a bit confusing, we don't know the source of this signal. But we don't care, and "info->si_code = 0" is imho worse. 
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index b65879d4e08f..d7c7f3cd4da8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -423,7 +423,7 @@ still_pending: */ info->si_signo = sig; info->si_errno = 0; - info->si_code = 0; + info->si_code = SI_USER; info->si_pid = 0; info->si_uid = 0; } -- cgit v1.2.1 From ad09750b51150ca87531b8790a379214a974c167 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Dec 2009 16:47:25 -0800 Subject: signals: kill force_sig_specific() Kill force_sig_specific(), this trivial wrapper has no callers. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d7c7f3cd4da8..4a9d763f8922 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1062,12 +1062,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) return ret; } -void -force_sig_specific(int sig, struct task_struct *t) -{ - force_sig_info(sig, SEND_SIG_FORCED, t); -} - /* * Nuke all other threads in the group. */ -- cgit v1.2.1 From 1be53963b0519bd3681749a9bed8b83aeb005cca Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Dec 2009 16:47:26 -0800 Subject: signals: check ->group_stop_count after tracehook_get_signal() Move the call to do_signal_stop() down, after tracehook call. This makes ->group_stop_count condition visible to tracers before do_signal_stop() will participate in this group-stop. Currently the patch has no effect, tracehook_get_signal() always returns 0. 
Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 4a9d763f8922..1814e68e4de3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1840,11 +1840,6 @@ relock: for (;;) { struct k_sigaction *ka; - - if (unlikely(signal->group_stop_count > 0) && - do_signal_stop(0)) - goto relock; - /* * Tracing can induce an artifical signal and choose sigaction. * The return value in @signr determines the default action, @@ -1856,6 +1851,10 @@ relock: if (unlikely(signr != 0)) ka = return_ka; else { + if (unlikely(signal->group_stop_count > 0) && + do_signal_stop(0)) + goto relock; + signr = dequeue_signal(current, &current->blocked, info); -- cgit v1.2.1 From 7be6d991bca63bbcdc5bc3b09789f367a3486537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= Date: Tue, 15 Dec 2009 16:47:39 -0800 Subject: pid: tighten pidmap spinlock critical section by removing kfree() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid calling kfree() under pidmap spinlock, calling it afterwards. Normally kfree() is fast, but sometimes it can be slow, so avoid calling it under the spinlock if we can do it. 
Signed-off-by: André Goddard Rosa Cc: Pekka Enberg Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index d3f722d20f9c..55fd5900bac8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -141,11 +141,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) * installing it: */ spin_lock_irq(&pidmap_lock); - if (map->page) - kfree(page); - else + if (!map->page) { map->page = page; + page = NULL; + } spin_unlock_irq(&pidmap_lock); + kfree(page); if (unlikely(!map->page)) break; } -- cgit v1.2.1 From 417e315247ebc199d357855bb08d2a5264400565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= Date: Tue, 15 Dec 2009 16:47:40 -0800 Subject: pid: reduce code size by using a pointer to iterate over array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It decreases code size by 16 bytes on my gcc 4.4.1 on Core 2: text data bss dec hex filename 4314 2216 8 6538 198a kernel/pid.o-BEFORE 4298 2216 8 6522 197a kernel/pid.o-AFTER Signed-off-by: André Goddard Rosa Cc: Pekka Enberg Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 55fd5900bac8..2e17c9c92cbe 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -269,12 +269,11 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); + upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for (i = ns->level; i >= 0; i--) { - upid = &pid->numbers[i]; + for ( ; upid >= pid->numbers; --upid) hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); - } spin_unlock_irq(&pidmap_lock); out: -- cgit v1.2.1 From 06a7f711246b081afc21fff859f1003f1f2a0fbc 
Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 15 Dec 2009 16:47:46 -0800 Subject: kexec: premit reduction of the reserved memory size Implement shrinking the reserved memory for crash kernel, if it is more than enough. For example, if you have already reserved 128M, now you just want 100M, you can do: # echo $((100*1024*1024)) > /sys/kernel/kexec_crash_size Note, you can only do this before loading the crash kernel. Signed-off-by: WANG Cong Cc: Neil Horman Acked-by: Eric W. Biederman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/ksysfs.c | 21 ++++++++++++++++++++ 2 files changed, 80 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index f336e2107f98..433e9fcc1fc5 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -1082,6 +1083,64 @@ void crash_kexec(struct pt_regs *regs) } } +size_t crash_get_memory_size(void) +{ + size_t size; + mutex_lock(&kexec_mutex); + size = crashk_res.end - crashk_res.start + 1; + mutex_unlock(&kexec_mutex); + return size; +} + +static void free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); + init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); + free_page((unsigned long)__va(addr)); + totalram_pages++; + } +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long start, end; + + mutex_lock(&kexec_mutex); + + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + start = crashk_res.start; + end = crashk_res.end; + + if (new_size >= end - start + 1) { + ret = -EINVAL; + if (new_size == end - start + 1) + ret = 0; + goto unlock; + } + + start = roundup(start, PAGE_SIZE); + end = roundup(start + new_size, PAGE_SIZE); + + 
free_reserved_phys_range(end, crashk_res.end); + + if (start == end) { + crashk_res.end = end; + release_resource(&crashk_res); + } else + crashk_res.end = end - 1; + +unlock: + mutex_unlock(&kexec_mutex); + return ret; +} + static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, size_t data_len) { diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 528dd78e7e7e..3feaf5a74514 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj, } KERNEL_ATTR_RO(kexec_crash_loaded); +static ssize_t kexec_crash_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%zu\n", crash_get_memory_size()); +} +static ssize_t kexec_crash_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long cnt; + int ret; + + if (strict_strtoul(buf, 0, &cnt)) + return -EINVAL; + + ret = crash_shrink_memory(cnt); + return ret < 0 ? ret : count; +} +KERNEL_ATTR_RW(kexec_crash_size); + static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_KEXEC &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, + &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif NULL -- cgit v1.2.1 From f4c4176f21533e22bcc292030da72bcfa105f5b8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 17:55:54 +0100 Subject: perf events: Allow per-task-per-cpu counters In order to allow for per-task-per-cpu counters, useful for scalability when profiling task hierarchies, we allow installing events with event->cpu != -1 in task contexts. __perf_event_sched_in() already skips events where ->cpu mis-matches the current cpu, fix up __perf_install_in_context() and __perf_event_enable() to also respect this filter. 
This does lead to very hard to interpret enabled/running times for such counters, but I don't see a simple solution for that. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: fweisbec@gmail.com Cc: Paul Mackerras LKML-Reference: <20091216165904.831451147@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 0dd8e5d02c66..2e0aaa34fc7e 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -782,6 +782,9 @@ static void __perf_install_in_context(void *info) add_event_to_ctx(event, ctx); + if (event->cpu != -1 && event->cpu != smp_processor_id()) + goto unlock; + /* * Don't put the event on if it is disabled or if * it is in a group and the group isn't on. @@ -925,6 +928,9 @@ static void __perf_event_enable(void *info) goto unlock; __perf_event_mark_enabled(event, ctx); + if (event->cpu != -1 && event->cpu != smp_processor_id()) + goto unlock; + /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. @@ -1595,10 +1601,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) unsigned long flags; int err; - /* - * If cpu is not a wildcard then this is a percpu event: - */ - if (cpu != -1) { + if (pid == -1 && cpu != -1) { /* Must be root to operate on a CPU event: */ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); -- cgit v1.2.1 From 9ee349ad6d326df3633d43f54202427295999c47 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Wed, 16 Dec 2009 18:04:32 +0100 Subject: sched: Fix set_cpu_active() in cpu_down() Sachin found cpu hotplug test failures on powerpc, which made the kernel hang on his POWER box. The problem is that we fail to re-activate a cpu when a hot-unplug fails. Fix this by moving the de-activation into _cpu_down after doing the initial checks.
Remove the synchronize_sched() calls and rely on those implied by rebuilding the sched domains using the new mask. Reported-by: Sachin Sant Signed-off-by: Xiaotian Feng Tested-by: Sachin Sant Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.500272612@chello.nl> Signed-off-by: Ingo Molnar --- kernel/cpu.c | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 291ac586f37f..1c8ddd6ee940 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -209,6 +209,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -ENOMEM; cpu_hotplug_begin(); + set_cpu_active(cpu, false); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { @@ -280,18 +281,6 @@ int __ref cpu_down(unsigned int cpu) goto out; } - set_cpu_active(cpu, false); - - /* - * Make sure the all cpus did the reschedule and are not - * using stale version of the cpu_active_mask. - * This is not strictly necessary becuase stop_machine() - * that we run down the line already provides the required - * synchronization. But it's really a side effect and we do not - * want to depend on the innards of the stop_machine here. 
- */ - synchronize_sched(); - err = _cpu_down(cpu, 0); out: @@ -382,19 +371,12 @@ int disable_nonboot_cpus(void) return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); - /* We take down all of the non-boot CPUs in one shot to avoid races + /* + * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); - for_each_online_cpu(cpu) { - if (cpu == first_cpu) - continue; - set_cpu_active(cpu, false); - } - - synchronize_sched(); - printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) -- cgit v1.2.1 From e6c8fba7771563b2f3dfb96a78f36ec17e15bdf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:33 +0100 Subject: sched: Fix task_hot() test order Make sure not to access sched_fair fields before verifying it is indeed a sched_fair task. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith CC: stable@kernel.org LKML-Reference: <20091216170517.577998058@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9c30858b6463..1d8ca25dd6fb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2046,6 +2046,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; + if (p->sched_class != &fair_sched_class) + return 0; + /* * Buddy candidates are cache hot: */ @@ -2054,9 +2057,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) &p->se == cfs_rq_of(&p->se)->last)) return 1; - if (p->sched_class != &fair_sched_class) - return 0; - if (sysctl_sched_migration_cost == -1) return 1; if (sysctl_sched_migration_cost == 0) -- cgit v1.2.1 From e4f4288842ee12747e10c354d72be7d424c0b627 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:34 +0100 Subject: sched: Select_task_rq_fair() must honour SD_LOAD_BALANCE 
We should skip !SD_LOAD_BALANCE domains. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.653578430@chello.nl> CC: stable@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5bedf6e3ebf3..ec1d2715620c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1429,6 +1429,9 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } for_each_domain(cpu, tmp) { + if (!(tmp->flags & SD_LOAD_BALANCE)) + continue; + /* * If power savings logic is enabled for a domain, see if we * are not overloaded, if so, don't balance wider. -- cgit v1.2.1 From 06b83b5fbea273672822b6ee93e16781046553ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:35 +0100 Subject: sched: Use TASK_WAKING for fork wakups For later convenience use TASK_WAKING for fresh tasks. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.732561278@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1d8ca25dd6fb..1672823aabfe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2540,14 +2540,6 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif - - /* - * We mark the process as running here, but have not actually - * inserted it onto the runqueue yet. This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_RUNNING; } /* @@ -2558,6 +2550,12 @@ void sched_fork(struct task_struct *p, int clone_flags) int cpu = get_cpu(); __sched_fork(p); + /* + * We mark the process as waking here. 
This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_WAKING; /* * Revert to default priority/policy on fork if requested. @@ -2626,7 +2624,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) struct rq *rq; rq = task_rq_lock(p, &flags); - BUG_ON(p->state != TASK_RUNNING); + BUG_ON(p->state != TASK_WAKING); + p->state = TASK_RUNNING; update_rq_clock(rq); activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); @@ -6984,6 +6983,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); __sched_fork(idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); -- cgit v1.2.1 From e2912009fb7b715728311b0d8fe327a1432b3f79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:36 +0100 Subject: sched: Ensure set_task_cpu() is never called on blocked tasks In order to clean up the set_task_cpu() rq dependencies we need to ensure it is never called on blocked tasks because such usage does not pair with consistent rq->lock usage. This puts the migration burden on ttwu(). Furthermore we need to close a race against changing ->cpus_allowed, since select_task_rq() runs with only preemption disabled. For sched_fork() this is safe because the child isn't in the tasklist yet, for wakeup we fix this by synchronizing set_cpus_allowed_ptr() against TASK_WAKING, which leaves sched_exec to be a problem This also closes a hole in (6ad4c1888 sched: Fix balance vs hotplug race) where ->select_task_rq() doesn't validate the result against the sched_domain/root_domain. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.807938893@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 85 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1672823aabfe..33d7965f63f0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2018,22 +2018,15 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, */ void kthread_bind(struct task_struct *p, unsigned int cpu) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - /* Must have done schedule() in kthread() before we set_task_cpu */ if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { WARN_ON(1); return; } - raw_spin_lock_irqsave(&rq->lock, flags); - update_rq_clock(rq); - set_task_cpu(p, cpu); p->cpus_allowed = cpumask_of_cpu(cpu); p->rt.nr_cpus_allowed = 1; p->flags |= PF_THREAD_BOUND; - raw_spin_unlock_irqrestore(&rq->lock, flags); } EXPORT_SYMBOL(kthread_bind); @@ -2074,6 +2067,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); +#ifdef CONFIG_SCHED_DEBUG + /* + * We should never call set_task_cpu() on a blocked task, + * ttwu() will sort out the placement. + */ + WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING); +#endif + trace_sched_migrate_task(p, new_cpu); if (old_cpu != new_cpu) { @@ -2107,13 +2108,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * If the task is not on a runqueue (and not running), then - * it is sufficient to simply update the task's cpu field. + * the next wake-up will properly place the task. 
*/ - if (!p->se.on_rq && !task_running(rq, p)) { - update_rq_clock(rq); - set_task_cpu(p, dest_cpu); + if (!p->se.on_rq && !task_running(rq, p)) return 0; - } init_completion(&req->done); req->task = p; @@ -2319,10 +2317,42 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +/* + * Called from: + * + * - fork, @p is stable because it isn't on the tasklist yet + * + * - exec, @p is unstable XXX + * + * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so + * we should be good. + */ static inline int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - return p->sched_class->select_task_rq(p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + + /* + * In order not to call set_task_cpu() on a blocking task we need + * to rely on ttwu() to place the task on a valid ->cpus_allowed + * cpu. + * + * Since this is common to all placement strategies, this lives here. + * + * [ this allows ->select_task() to simply return task_cpu(p) and + * not worry about this generic constraint ] + */ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || + !cpu_active(cpu))) { + + cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + /* + * XXX: race against hot-plug modifying cpu_active_mask + */ + BUG_ON(cpu >= nr_cpu_ids); + } + + return cpu; } #endif @@ -7098,7 +7128,23 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) struct rq *rq; int ret = 0; + /* + * Since we rely on wake-ups to migrate sleeping tasks, don't change + * the ->cpus_allowed mask from under waking tasks, which would be + * possible when we change rq->lock in ttwu(), so synchronize against + * TASK_WAKING to avoid that. 
+ */ +again: + while (p->state == TASK_WAKING) + cpu_relax(); + rq = task_rq_lock(p, &flags); + + if (p->state == TASK_WAKING) { + task_rq_unlock(rq, &flags); + goto again; + } + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; @@ -7154,7 +7200,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; - int ret = 0, on_rq; + int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; @@ -7170,12 +7216,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; - on_rq = p->se.on_rq; - if (on_rq) + /* + * If we're not on a rq, the next wake-up will ensure we're + * placed properly. + */ + if (p->se.on_rq) { deactivate_task(rq_src, p, 0); - - set_task_cpu(p, dest_cpu); - if (on_rq) { + set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p, 0); } -- cgit v1.2.1 From 3802290628348674985d14914f9bfee7b9084548 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:37 +0100 Subject: sched: Fix sched_exec() balancing Since we access ->cpus_allowed without holding rq->lock we need a retry loop to validate the result, this comes for near free when we merge sched_migrate_task() into sched_exec() since that already does the needed check. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.884743662@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 33d7965f63f0..63e55ac242d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2322,7 +2322,7 @@ void task_oncpu_function_call(struct task_struct *p, * * - fork, @p is stable because it isn't on the tasklist yet * - * - exec, @p is unstable XXX + * - exec, @p is unstable, retry loop * * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so * we should be good. @@ -3132,21 +3132,36 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) } /* - * If dest_cpu is allowed for this process, migrate the task to it. - * This is accomplished by forcing the cpu_allowed mask to only - * allow dest_cpu, which will force the cpu onto dest_cpu. Then - * the cpu_allowed mask is restored. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. 
*/ -static void sched_migrate_task(struct task_struct *p, int dest_cpu) +void sched_exec(void) { + struct task_struct *p = current; struct migration_req req; + int dest_cpu, this_cpu; unsigned long flags; struct rq *rq; +again: + this_cpu = get_cpu(); + dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0); + if (dest_cpu == this_cpu) { + put_cpu(); + return; + } + rq = task_rq_lock(p, &flags); + put_cpu(); + + /* + * select_task_rq() can race against ->cpus_allowed + */ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) - || unlikely(!cpu_active(dest_cpu))) - goto out; + || unlikely(!cpu_active(dest_cpu))) { + task_rq_unlock(rq, &flags); + goto again; + } /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { @@ -3161,23 +3176,9 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) return; } -out: task_rq_unlock(rq, &flags); } -/* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. - */ -void sched_exec(void) -{ - int new_cpu, this_cpu = get_cpu(); - new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); - put_cpu(); - if (new_cpu != this_cpu) - sched_migrate_task(current, new_cpu); -} - /* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. -- cgit v1.2.1 From 5da9a0fb673a0ea0a093862f95f6b89b3390c31e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:38 +0100 Subject: sched: Fix select_task_rq() vs hotplug issues Since select_task_rq() is now responsible for guaranteeing ->cpus_allowed and cpu_active_mask, we need to verify this. select_task_rq_rt() can blindly return smp_processor_id()/task_cpu() without checking the valid masks, select_task_rq_fair() can do the same in the rare case that all SD_flags are disabled. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.961475466@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 75 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 63e55ac242d1..cc40bdadee7a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2317,6 +2317,43 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +static int select_fallback_rq(int cpu, struct task_struct *p) +{ + int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + return dest_cpu; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + if (dest_cpu < nr_cpu_ids) + return dest_cpu; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + rcu_read_lock(); + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + rcu_read_unlock(); + dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. 
+ */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } + } + + return dest_cpu; +} + /* * Called from: * @@ -2343,14 +2380,8 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || - !cpu_active(cpu))) { - - cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); - /* - * XXX: race against hot-plug modifying cpu_active_mask - */ - BUG_ON(cpu >= nr_cpu_ids); - } + !cpu_active(cpu))) + cpu = select_fallback_rq(task_cpu(p), p); return cpu; } @@ -7319,36 +7350,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { int dest_cpu; - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); again: - /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) - goto move; - - /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); - if (dest_cpu < nr_cpu_ids) - goto move; - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - pr_info("process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } - } + dest_cpu = select_fallback_rq(dead_cpu, p); -move: /* It can have affinity changed while we were choosing. 
*/ if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) goto again; -- cgit v1.2.1 From 881232b70b195768a71cd74ff4b4e8ab9502997b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:39 +0100 Subject: sched: Move kthread_bind() back to kthread.c Since kthread_bind() lost its dependencies on sched.c, move it back where it came from. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.039524041@chello.nl> Signed-off-by: Ingo Molnar --- kernel/kthread.c | 23 +++++++++++++++++++++++ kernel/sched.c | 26 -------------------------- 2 files changed, 23 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index ab7ae57773e1..fbb6222fe7e0 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -149,6 +149,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create); +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @p: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; +} +EXPORT_SYMBOL(kthread_bind); + /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). 
diff --git a/kernel/sched.c b/kernel/sched.c index cc40bdadee7a..297dc441ff96 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2004,32 +2004,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } -/** - * kthread_bind - bind a just-created kthread to a cpu. - * @p: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). - * - * Function lives here instead of kthread.c because it messes with - * scheduler internals which require locking. - */ -void kthread_bind(struct task_struct *p, unsigned int cpu) -{ - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - - p->cpus_allowed = cpumask_of_cpu(cpu); - p->rt.nr_cpus_allowed = 1; - p->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - #ifdef CONFIG_SMP /* * Is this task likely cache-hot: -- cgit v1.2.1 From efbbd05a595343a413964ad85a2ad359b7b7efbd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:40 +0100 Subject: sched: Add pre and post wakeup hooks As will be apparent in the next patch, we need a pre wakeup hook for sched_fair task migration, hence rename the post wakeup hook and one pre wakeup. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.114746117@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++---- kernel/sched_rt.c | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 297dc441ff96..6c571bdd5658 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2412,6 +2412,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; + + if (p->sched_class->task_waking) + p->sched_class->task_waking(rq, p); + __task_rq_unlock(rq); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); @@ -2475,8 +2479,8 @@ out_running: p->state = TASK_RUNNING; #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); if (unlikely(rq->idle_stamp)) { u64 delta = rq->clock - rq->idle_stamp; @@ -2666,8 +2670,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d2ea2828164e..f48328ac216f 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1472,7 +1472,7 @@ static void post_schedule_rt(struct rq *rq) * If we are not running and we are not going to reschedule soon, we should * try to push tasks away now */ -static void task_wake_up_rt(struct rq *rq, struct task_struct *p) +static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && @@ -1753,7 +1753,7 @@ static const struct sched_class rt_sched_class = { .rq_offline = rq_offline_rt, 
.pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, - .task_wake_up = task_wake_up_rt, + .task_woken = task_woken_rt, .switched_from = switched_from_rt, #endif -- cgit v1.2.1 From 88ec22d3edb72b261f8628226cd543589a6d5e1b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:41 +0100 Subject: sched: Remove the cfs_rq dependency from set_task_cpu() In order to remove the cfs_rq dependency from set_task_cpu() we need to ensure the task is cfs_rq invariant for all callsites. The simple approach is to subtract cfs_rq->min_vruntime from se->vruntime on dequeue, and add cfs_rq->min_vruntime on enqueue. However, this has the downside of breaking FAIR_SLEEPERS since we lose the old vruntime as we only maintain the relative position. To solve this, we observe that we only migrate runnable tasks, we do this using deactivate_task(.sleep=0) and activate_task(.wakeup=0), therefore we can restrain the min_vruntime invariance to that state. The only other case is wakeup balancing, since we want to maintain the old vruntime we cannot make it relative on dequeue, but since we don't migrate inactive tasks, we can do so right before we activate it again. This is where we need the new pre-wakeup hook, we need to call this while still holding the old rq->lock. We could fold it into ->select_task_rq(), but since that has multiple callsites and would obfuscate the locking requirements, that seems like a fudge. This leaves the fork() case, simply make sure that ->task_fork() leaves the ->vruntime in a relative state. This covers all cases where set_task_cpu() gets called, and ensures it sees a relative vruntime.
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.191697025@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +----- kernel/sched_fair.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6c571bdd5658..f92ce63edfff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2038,8 +2038,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct cfs_rq *old_cfsrq = task_cfs_rq(p), - *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); #ifdef CONFIG_SCHED_DEBUG /* @@ -2056,8 +2054,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } - p->se.vruntime -= old_cfsrq->min_vruntime - - new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } @@ -10102,7 +10098,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk); + tsk->sched_class->moved_group(tsk, on_rq); #endif if (unlikely(running)) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ec1d2715620c..42ac3c9f66f6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -510,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = calc_delta_fair(delta_exec, curr); + curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); } @@ -765,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_MIGRATE 2 + static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +enqueue_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se, int flags) { + /* + * Update the normalized vruntime before updating min_vruntime + * through callig update_curr(). + */ + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + se->vruntime += cfs_rq->min_vruntime; + /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); - if (wakeup) { + if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } @@ -828,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + + /* + * Normalize the entity after updating the min_vruntime because the + * update can refer to the ->curr item and we need to reflect this + * movement in our normalized position. + */ + if (!sleep) + se->vruntime -= cfs_rq->min_vruntime; } /* @@ -1038,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int flags = 0; + + if (wakeup) + flags |= ENQUEUE_WAKEUP; + if (p->state == TASK_WAKING) + flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); - wakeup = 1; + enqueue_entity(cfs_rq, se, flags); + flags = ENQUEUE_WAKEUP; } hrtick_update(rq); @@ -1120,6 +1145,14 @@ static void yield_task_fair(struct rq *rq) #ifdef CONFIG_SMP +static void task_waking_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + se->vruntime -= cfs_rq->min_vruntime; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -1978,6 +2011,8 @@ static void task_fork_fair(struct task_struct *p) resched_task(rq->curr); } + se->vruntime -= cfs_rq->min_vruntime; + raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -2031,12 +2066,13 @@ 
static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p) +static void moved_group_fair(struct task_struct *p, int on_rq) { struct cfs_rq *cfs_rq = task_cfs_rq(p); update_curr(cfs_rq); - place_entity(cfs_rq, &p->se, 1); + if (!on_rq) + place_entity(cfs_rq, &p->se, 1); } #endif @@ -2076,6 +2112,8 @@ static const struct sched_class fair_sched_class = { .move_one_task = move_one_task_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, + + .task_waking = task_waking_fair, #endif .set_curr_task = set_curr_task_fair, -- cgit v1.2.1 From 738d2be4301007f054541c5c4bf7fb6a361c9b3a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:42 +0100 Subject: sched: Simplify set_task_cpu() Rearrange code a bit now that its a simpler function. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.269101883@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f92ce63edfff..8a2bfd37ab4f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2034,11 +2034,8 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } - void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { - int old_cpu = task_cpu(p); - #ifdef CONFIG_SCHED_DEBUG /* * We should never call set_task_cpu() on a blocked task, @@ -2049,11 +2046,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); - if (old_cpu != new_cpu) { - p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, - 1, 1, NULL, 0); - } + if (task_cpu(p) == new_cpu) + return; + + p->se.nr_migrations++; + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); __set_task_cpu(p, new_cpu); } -- cgit v1.2.1 From 61cf693159d6a968a7014e24905143f71ed8ddcf 
Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:28:44 +0100 Subject: [sysctl] Fix breakage on systems with older glibc As predicted during code review, the sysctl(2) changes made systems with old glibc nearly unusable. About every command gives a: warning: process `ls' used the deprecated sysctl system call with 1.4 warning in the log. I see this on a SUSE 10.0 system with glibc 2.3.5. Don't warn for this common case. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b75dbf40f573..112533d5fc08 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1399,6 +1399,13 @@ static void deprecated_sysctl_warning(const int *name, int nlen) { int i; + /* + * CTL_KERN/KERN_VERSION is used by older glibc and cannot + * ever go away. + */ + if (name[0] == CTL_KERN && name[1] == KERN_VERSION) + return; + if (printk_ratelimit()) { printk(KERN_INFO "warning: process `%s' used the deprecated sysctl " -- cgit v1.2.1 From 6e1415467614e854fee660ff6648bd10fa976e95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 15 Dec 2009 19:27:45 +0000 Subject: NOMMU: Optimise away the {dac_,}mmap_min_addr tests In NOMMU mode clamp dac_mmap_min_addr to zero to cause the tests on it to be skipped by the compiler. We do this as the minimum mmap address doesn't make any sense in NOMMU mode. mmap_min_addr and round_hint_to_min() can be discarded entirely in NOMMU mode. 
Signed-off-by: David Howells Acked-by: Eric Paris Signed-off-by: James Morris --- kernel/sysctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 45e4bef0012a..856a24eadf7e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1214,6 +1214,7 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif +#ifdef CONFIG_MMU { .procname = "mmap_min_addr", .data = &dac_mmap_min_addr, @@ -1221,6 +1222,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = mmap_min_addr_handler, }, +#endif #ifdef CONFIG_NUMA { .procname = "numa_zonelist_order", -- cgit v1.2.1 From cf1e367ee84e02ac349ad0858eb65e8a6a511c8b Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 17 Dec 2009 11:15:42 +1100 Subject: timers: Remove duplicate setting of new_base in __mod_timer() new_base is set using per_cpu(tvec_bases, cpu) after selecting the desired value of cpu immediately below so this line is unnecessary. Signed-off-by: Simon Horman LKML-Reference: <20091217001542.GD25317@verge.net.au> Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 5db5a8d26811..15533b792397 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -656,8 +656,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = __get_cpu_var(tvec_bases); - cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) -- cgit v1.2.1 From f6325e30ebd6fc870315b017a5d4a6ab15bf790b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 17 Dec 2009 11:43:08 -0600 Subject: cpumask: use cpu_online in kernel/perf_event.c Also, we want to check against nr_cpu_ids, not num_possible_cpus(). The latter works, but the correct bounds check is < nr_cpu_ids.
Signed-off-by: Rusty Russell To: Thomas Gleixner --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8ab86988bd24..97d1a3dd7a59 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1614,7 +1614,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) * offline CPU and activate it when the CPU comes up, but * that's for later. */ - if (!cpu_isset(cpu, cpu_online_map)) + if (!cpu_online(cpu)) return ERR_PTR(-ENODEV); cpuctx = &per_cpu(perf_cpu_context, cpu); -- cgit v1.2.1 From 62ac12795095dc959649c66ace78708e7ac52477 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 17 Dec 2009 11:43:26 -0600 Subject: cpumask: avoid dereferencing struct cpumask struct cpumask will be undefined soon with CONFIG_CPUMASK_OFFSTACK=y, to avoid them being declared on the stack. cpumask_bits() does what we want here (of course, this code is crap). Signed-off-by: Rusty Russell To: Thomas Gleixner --- kernel/time/timer_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 28265636b6c2..bdfb8dd1050c 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -237,10 +237,10 @@ static void timer_list_show_tickdevices(struct seq_file *m) #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", - tick_get_broadcast_mask()->bits[0]); + cpumask_bits(tick_get_broadcast_mask())[0]); #ifdef CONFIG_TICK_ONESHOT SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", - tick_get_broadcast_oneshot_mask()->bits[0]); + cpumask_bits(tick_get_broadcast_oneshot_mask())[0]); #endif SEQ_printf(m, "\n"); #endif -- cgit v1.2.1 From 416eb39556a03d1c7e52b0791e9052ccd71db241 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 17 Dec 2009 06:05:49 +0100 Subject: sched: Make 
warning less noisy Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.807938893@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8a2bfd37ab4f..af7dfa74e6bb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2041,7 +2041,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. */ - WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING); + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING); #endif trace_sched_migrate_task(p, new_cpu); -- cgit v1.2.1 From 6f3cf440470650b3841d325acacd0c5ea9504c68 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 16 Dec 2009 17:24:08 -0500 Subject: kprobe-tracer: Check new event/group name Check that a new event/group name has the same syntax as a C symbol. In other words, check the name in the same way as for other tracepoint events. This prevents the user from creating an event with a useless name (e.g. foo|bar, foo*bar).
Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: systemtap Cc: DLE LKML-Reference: <20091216222408.14459.68790.stgit@dhcp-100-2-132.bos.redhat.com> [ v2: minor cleanups ] Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7ecab06547a5..375f81a568dc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -282,6 +282,18 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); +/* Check the name is good for event/group */ +static int check_event_name(const char *name) +{ + if (!isalpha(*name) && *name != '_') + return 0; + while (*++name != '\0') { + if (!isalpha(*name) && !isdigit(*name) && *name != '_') + return 0; + } + return 1; +} + /* * Allocate new trace_probe and initialize it (including kprobes). 
*/ @@ -293,10 +305,11 @@ static struct trace_probe *alloc_trace_probe(const char *group, int nargs, int is_return) { struct trace_probe *tp; + int ret = -ENOMEM; tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); if (!tp) - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); if (symbol) { tp->symbol = kstrdup(symbol, GFP_KERNEL); @@ -312,14 +325,20 @@ static struct trace_probe *alloc_trace_probe(const char *group, else tp->rp.kp.pre_handler = kprobe_dispatcher; - if (!event) + if (!event || !check_event_name(event)) { + ret = -EINVAL; goto error; + } + tp->call.name = kstrdup(event, GFP_KERNEL); if (!tp->call.name) goto error; - if (!group) + if (!group || !check_event_name(group)) { + ret = -EINVAL; goto error; + } + tp->call.system = kstrdup(group, GFP_KERNEL); if (!tp->call.system) goto error; @@ -330,7 +349,7 @@ error: kfree(tp->call.name); kfree(tp->symbol); kfree(tp); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } static void free_probe_arg(struct probe_arg *arg) @@ -695,10 +714,10 @@ static int create_trace_probe(int argc, char **argv) if (!event) { /* Make a new event name */ if (symbol) - snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", is_return ? 'r' : 'p', symbol, offset); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 'r' : 'p', addr); event = buf; } -- cgit v1.2.1 From 234da7bcdc7aaa935846534c3b726dbc79a9cdd5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Dec 2009 20:21:05 +0100 Subject: sched: Teach might_sleep() about preemptible RCU In practice, it is harmless to voluntarily sleep in a rcu_read_lock() section if we are running under preempt rcu, but it is illegal if we build a kernel running non-preemptable rcu. Currently, might_sleep() doesn't notice sleepable operations under rcu_read_lock() sections if we are running under preemptable rcu because preempt_count() is left untouched after rcu_read_lock() in this case. 
But we want developers who test their changes under such config to notice the "sleeping while atomic" issues. So we add rcu_read_lock_nesting to preempt_count() in might_sleep() checks. [ v2: Handle rcu-tiny ] Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Cc: Peter Zijlstra LKML-Reference: <1260991265-8451-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index af7dfa74e6bb..7be88a7be047 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9682,7 +9682,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = preempt_count() & ~PREEMPT_ACTIVE; + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); } -- cgit v1.2.1 From 61c1917f47f73c968e92d04d15370b1dc3ec4592 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 17 Dec 2009 05:40:33 +0100 Subject: perf events, x86/stacktrace: Make stack walking optional The current print_context_stack helper that does the stack walking job is good for usual stacktraces as it walks through all the stack and reports even addresses that look unreliable, which is nice when we don't have frame pointers for example. But we have users like perf that only require reliable stacktraces, and those may want a more adapted stack walker, so let's make this function a callback in stacktrace_ops that users can tune for their needs.
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras LKML-Reference: <1261024834-5336-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_sysprof.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index f6693969287d..a7974a552ca9 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -93,6 +93,7 @@ static const struct stacktrace_ops backtrace_ops = { .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, + .walk_stack = print_context_stack, }; static int -- cgit v1.2.1 From 5d27c23df09b702868d9a3bff86ec6abd22963ac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:32 +0100 Subject: perf events: Dont report side-band events on each cpu for per-task-per-cpu events Acme noticed that his FORK/MMAP numbers were inflated by about the same factor as his cpu-count. This led to the discovery of a few more sites that need to respect the event->cpu filter. 
Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20091217121830.215333434@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8ab86988bd24..03cc061398d1 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1381,6 +1381,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (event->state != PERF_EVENT_STATE_ACTIVE) continue; + if (event->cpu != -1 && event->cpu != smp_processor_id()) + continue; + hwc = &event->hw; interrupts = hwc->interrupts; @@ -3265,6 +3268,9 @@ static void perf_event_task_output(struct perf_event *event, static int perf_event_task_match(struct perf_event *event) { + if (event->cpu != -1 && event->cpu != smp_processor_id()) + return 0; + if (event->attr.comm || event->attr.mmap || event->attr.task) return 1; @@ -3290,12 +3296,11 @@ static void perf_event_task_event(struct perf_task_event *task_event) rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); - put_cpu_var(perf_cpu_context); - if (!ctx) ctx = rcu_dereference(task_event->task->perf_event_ctxp); if (ctx) perf_event_task_ctx(ctx, task_event); + put_cpu_var(perf_cpu_context); rcu_read_unlock(); } @@ -3372,6 +3377,9 @@ static void perf_event_comm_output(struct perf_event *event, static int perf_event_comm_match(struct perf_event *event) { + if (event->cpu != -1 && event->cpu != smp_processor_id()) + return 0; + if (event->attr.comm) return 1; @@ -3408,15 +3416,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_comm_ctx(&cpuctx->ctx, comm_event); - put_cpu_var(perf_cpu_context); - - /* - * doesn't really matter which of the child contexts the - * events ends up in. 
- */ ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_comm_ctx(ctx, comm_event); + put_cpu_var(perf_cpu_context); rcu_read_unlock(); } @@ -3491,6 +3494,9 @@ static void perf_event_mmap_output(struct perf_event *event, static int perf_event_mmap_match(struct perf_event *event, struct perf_mmap_event *mmap_event) { + if (event->cpu != -1 && event->cpu != smp_processor_id()) + return 0; + if (event->attr.mmap) return 1; @@ -3564,15 +3570,10 @@ got_name: rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); - put_cpu_var(perf_cpu_context); - - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_mmap_ctx(ctx, mmap_event); + put_cpu_var(perf_cpu_context); rcu_read_unlock(); kfree(buf); @@ -3863,6 +3864,9 @@ static int perf_swevent_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + if (event->cpu != -1 && event->cpu != smp_processor_id()) + return 0; + if (!perf_swevent_is_counting(event)) return 0; -- cgit v1.2.1 From 077614ee1e93245a3b9a4e1213659405dbeb0ba6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:31 +0100 Subject: sched: Fix broken assertion There's a preemption race in the set_task_cpu() debug check in that when we get preempted after setting task->state we'd still be on the rq proper, but fail the test. Check for preempted tasks, since those are always on the RQ. 
Signed-off-by: Peter Zijlstra LKML-Reference: <20091217121830.137155561@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7be88a7be047..720df108a2d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2041,7 +2041,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. */ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING); + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && + !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); #endif trace_sched_migrate_task(p, new_cpu); -- cgit v1.2.1 From 3e26120cc7c819c97bc07281ca1fb9017cfe9a39 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 17 Dec 2009 15:27:05 -0800 Subject: kernel/sysctl.c: fix the incomplete part of sysctl_max_map_count-should-be-non-negative.patch It is a mistake that we used 'proc_dointvec', it should be 'proc_dointvec_minmax', as in the original patch. Signed-off-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 45e4bef0012a..6665761c006d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1131,7 +1131,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, #else -- cgit v1.2.1 From 9cd80bbb07fcd6d4d037fad4297496d3b132ac6b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 17 Dec 2009 15:27:15 -0800 Subject: do_wait() optimization: do not place sub-threads on task_struct->children list Thanks to Roland who pointed out de_thread() issues. 
Currently we add sub-threads to ->real_parent->children list. This buys nothing but slows down do_wait(). With this patch ->children contains only main threads (group leaders). The only complication is that forget_original_parent() should iterate over sub-threads by hand, and de_thread() needs another list_replace() when it changes ->group_leader. Henceforth do_wait_thread() can never see task_detached() && !EXIT_DEAD tasks, we can remove this check (and we can unify do_wait_thread() and ptrace_do_wait()). This change can confuse the optimistic search in mm_update_next_owner(), but this is fixable and minor. Perhaps badness() and oom_kill_process() should be updated, but they should be fixed in any case. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Cc: Ratan Nalumasu Cc: Vitaly Mayatskikh Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 36 +++++++++++++++++------------------- kernel/fork.c | 2 +- 2 files changed, 18 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 5962d7ccf243..546774a31a66 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -68,10 +68,10 @@ static void __unhash_process(struct task_struct *p) detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); + list_del_init(&p->sibling); __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); - list_del_init(&p->sibling); } /* @@ -736,12 +736,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father) /* * Any that need to be release_task'd are put on the @dead list. 
*/ -static void reparent_thread(struct task_struct *father, struct task_struct *p, +static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { - if (p->pdeath_signal) - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); - list_move_tail(&p->sibling, &p->real_parent->children); if (task_detached(p)) @@ -780,12 +777,18 @@ static void forget_original_parent(struct task_struct *father) reaper = find_new_reaper(father); list_for_each_entry_safe(p, n, &father->children, sibling) { - p->real_parent = reaper; - if (p->parent == father) { - BUG_ON(task_ptrace(p)); - p->parent = p->real_parent; - } - reparent_thread(father, p, &dead_children); + struct task_struct *t = p; + do { + t->real_parent = reaper; + if (t->parent == father) { + BUG_ON(task_ptrace(t)); + t->parent = t->real_parent; + } + if (t->pdeath_signal) + group_send_sig_info(t->pdeath_signal, + SEND_SIG_NOINFO, t); + } while_each_thread(p, t); + reparent_leader(father, p, &dead_children); } write_unlock_irq(&tasklist_lock); @@ -1551,14 +1554,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { - /* - * Do not consider detached threads. 
- */ - if (!task_detached(p)) { - int ret = wait_consider_task(wo, 0, p); - if (ret) - return ret; - } + int ret = wait_consider_task(wo, 0, p); + if (ret) + return ret; } return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 202a0ba63d3c..5b2959b3ffc2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1291,7 +1291,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - list_add_tail(&p->sibling, &p->real_parent->children); tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { @@ -1303,6 +1302,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); + list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } -- cgit v1.2.1 From 6485536bcf499839a54dcda8a8d47ea0bd29b375 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 17 Dec 2009 15:27:27 -0800 Subject: printk: fix new kernel-doc warnings Fix kernel-doc warnings in printk.c: Warning(kernel/printk.c:1422): No description found for parameter 'dumper' Warning(kernel/printk.c:1422): Excess function parameter 'dump' description in 'kmsg_dump_register' Warning(kernel/printk.c:1451): No description found for parameter 'dumper' Warning(kernel/printk.c:1451): Excess function parameter 'dump' description in 'kmsg_dump_unregister' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 1ded8e7dd19b..17463ca2e229 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1412,7 +1412,7 @@ static LIST_HEAD(dump_list); /** * kmsg_dump_register - register a kernel log dumper. 
- * @dump: pointer to the kmsg_dumper structure + * @dumper: pointer to the kmsg_dumper structure * * Adds a kernel log dumper to the system. The dump callback in the * structure will be called when the kernel oopses or panics and must be @@ -1442,7 +1442,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_register); /** * kmsg_dump_unregister - unregister a kmsg dumper. - * @dump: pointer to the kmsg_dumper structure + * @dumper: pointer to the kmsg_dumper structure * * Removes a dump device from the system. Returns zero on success and * %-EINVAL otherwise. -- cgit v1.2.1 From 6f5d51148921c242680a7a1d9913384a30ab3cbe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Dec 2009 15:59:45 +0000 Subject: fix braindamage in audit_tree.c untag_chunk() ... aka "Al had badly fscked up when writing that thing and nobody noticed until Eric had fixed leaks that used to mask the breakage". The function essentially creates a copy of old array sans one element and replaces the references to elements of original (they are on cyclic lists) with those to corresponding elements of new one. After that the old one is fair game for freeing. First of all, there's a dumb braino: when we get to list_replace_init we use indices for wrong arrays - position in new one with the old array and vice versa. Another bug is more subtle - termination condition is wrong if the element to be excluded happens to be the last one. We shouldn't go until we fill the new array, we should go until we'd finished the old one. Otherwise the element we are trying to kill will remain on the cyclic lists... That crap used to be masked by several leaks, so it was not quite trivial to hit. Eric had fixed some of those leaks a while ago and the shit had hit the fan... 
Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 2451dc6f3282..b36aa9651ba2 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -277,7 +277,7 @@ static void untag_chunk(struct node *p) owner->root = NULL; } - for (i = j = 0; i < size; i++, j++) { + for (i = j = 0; j <= size; i++, j++) { struct audit_tree *s; if (&chunk->owners[j] == p) { list_del_init(&p->list); @@ -290,7 +290,7 @@ static void untag_chunk(struct node *p) if (!s) /* result of earlier fallback */ continue; get_tree(s); - list_replace_init(&chunk->owners[i].list, &new->owners[j].list); + list_replace_init(&chunk->owners[j].list, &new->owners[i].list); } list_replace_rcu(&chunk->hash, &new->hash); -- cgit v1.2.1 From b4c30aad39805902cf5b855aa8a8b22d728ad057 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Dec 2009 16:03:30 +0000 Subject: fix more leaks in audit_tree.c tag_chunk() Several leaks in audit_tree didn't get caught by commit 318b6d3d7ddbcad3d6867e630711b8a705d873d7, including the leak on normal exit in case of multiple rules referring to the same chunk.
Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b36aa9651ba2..4b05bd9479db 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -373,15 +373,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); - put_inotify_watch(watch); + put_inotify_watch(&old->watch); return 0; } } spin_unlock(&hash_lock); chunk = alloc_chunk(old->count + 1); - if (!chunk) + if (!chunk) { + put_inotify_watch(&old->watch); return -ENOMEM; + } mutex_lock(&inode->inotify_mutex); if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { @@ -425,7 +427,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); inotify_evict_watch(&old->watch); mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); + put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ + put_inotify_watch(&old->watch); /* and kill it */ return 0; } -- cgit v1.2.1 From 3df0fc5b2e9d8092dcaeb5ae0b6753d85c851d66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Dec 2009 14:23:57 +0100 Subject: sched: Restore printk sanity Revert the braindead pr_* crap. (Commit 663997d "sched: Use pr_fmt() and pr_<level>()") It's dumb and causes stupid "sched: " strings all over the place. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Cc: Joe Perches Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <1261315437.4314.6.camel@laptop> [ i dont mind the pr_*() patterns that much - but Peter dislikes them with a vengeance.
] [ - v2: remove spurious diffstat from changelog :-/ ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 89 +++++++++++++++++++++++++++---------------------- kernel/sched_idletask.c | 2 +- 2 files changed, 50 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 720df108a2d6..7ffde2ae7868 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -26,8 +26,6 @@ * Thomas Gleixner, Mike Kravetz */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include @@ -5375,8 +5373,8 @@ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); - pr_err("BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); print_modules(); @@ -6940,23 +6938,23 @@ void sched_show_task(struct task_struct *p) unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - pr_info("%-13.13s %c", p->comm, + printk(KERN_INFO "%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); #if BITS_PER_LONG == 32 if (state == TASK_RUNNING) - pr_cont(" running "); + printk(KERN_CONT " running "); else - pr_cont(" %08lx ", thread_saved_pc(p)); + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); #else if (state == TASK_RUNNING) - pr_cont(" running task "); + printk(KERN_CONT " running task "); else - pr_cont(" %016lx ", thread_saved_pc(p)); + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - pr_cont("%5lu %5d %6d 0x%08lx\n", free, + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, task_pid_nr(p), task_pid_nr(p->real_parent), (unsigned long)task_thread_info(p)->flags); @@ -6968,9 +6966,11 @@ void show_state_filter(unsigned long state_filter) struct task_struct *g, *p; #if BITS_PER_LONG == 32 - pr_info(" task PC stack pid father\n"); + printk(KERN_INFO + " task PC stack pid father\n"); #else - pr_info(" task PC stack pid father\n"); + printk(KERN_INFO + " task PC stack pid father\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -7828,44 +7828,48 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_DEBUG "%*s domain %d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { - pr_cont("does not load-balance\n"); + printk("does not load-balance\n"); if (sd->parent) - pr_err("ERROR: !SD_LOAD_BALANCE domain has parent\n"); + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); return -1; } - pr_cont("span %s level %s\n", str, sd->name); + printk(KERN_CONT "span %s level %s\n", str, sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - pr_err("ERROR: domain->span does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); } if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { - pr_err("ERROR: domain->groups does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); } 
printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { - pr_cont("\n"); - pr_err("ERROR: group is NULL\n"); + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); break; } if (!group->cpu_power) { - pr_cont("\n"); - pr_err("ERROR: domain->cpu_power not set\n"); + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); break; } if (!cpumask_weight(sched_group_cpus(group))) { - pr_cont("\n"); - pr_err("ERROR: empty group\n"); + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: empty group\n"); break; } if (cpumask_intersects(groupmask, sched_group_cpus(group))) { - pr_cont("\n"); - pr_err("ERROR: repeated CPUs\n"); + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } @@ -7873,21 +7877,23 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - pr_cont(" %s", str); + printk(KERN_CONT " %s", str); if (group->cpu_power != SCHED_LOAD_SCALE) { - pr_cont(" (cpu_power = %d)", group->cpu_power); + printk(KERN_CONT " (cpu_power = %d)", + group->cpu_power); } group = group->next; } while (group != sd->groups); - pr_cont("\n"); + printk(KERN_CONT "\n"); if (!cpumask_equal(sched_domain_span(sd), groupmask)) - pr_err("ERROR: groups don't span domain->span\n"); + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - pr_err("ERROR: parent span is not a superset of domain->span\n"); + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); return 0; } @@ -8443,7 +8449,8 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - pr_warning("Can not alloc domain group for node %d\n", num); + printk(KERN_WARNING "Can not alloc domain group for node %d\n", + num); return -ENOMEM; } d->sched_group_nodes[num] = sg; @@ -8472,8 
+8479,8 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - pr_warning("Can not alloc domain group for node %d\n", - j); + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); return -ENOMEM; } sg->cpu_power = 0; @@ -8701,7 +8708,7 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, d->sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!d->sched_group_nodes) { - pr_warning("Can not alloc sched group node list\n"); + printk(KERN_WARNING "Can not alloc sched group node list\n"); return sa_notcovered; } sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; @@ -8718,7 +8725,7 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_send_covered; d->rd = alloc_rootdomain(); if (!d->rd) { - pr_warning("Cannot alloc root domain\n"); + printk(KERN_WARNING "Cannot alloc root domain\n"); return sa_tmpmask; } return sa_rootdomain; @@ -9700,11 +9707,13 @@ void __might_sleep(char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; - pr_err("BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - pr_err("in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); debug_show_held_locks(current); if (irqs_disabled()) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 21b969a28725..5f93b570d383 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -35,7 +35,7 @@ static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) { raw_spin_unlock_irq(&rq->lock); - pr_err("bad: 
scheduling from the idle thread!\n"); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); raw_spin_lock_irq(&rq->lock); } -- cgit v1.2.1 From 70f1120527797adb31c68bdc6f1b45e182c342c7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Dec 2009 17:36:27 +0100 Subject: sched: Fix hotplug hang The hot-unplug kstopmachine usage does a wakeup after deactivating the cpu, hence we cannot use cpu_active() here but must rely on the good olde online. Reported-by: Sachin Sant Reported-by: Jens Axboe Signed-off-by: Peter Zijlstra Tested-by: Jens Axboe Cc: Heiko Carstens Cc: Benjamin Herrenschmidt LKML-Reference: <1261326987.4314.24.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7ffde2ae7868..87f1f47beffe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2346,7 +2346,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || - !cpu_active(cpu))) + !cpu_online(cpu))) cpu = select_fallback_rq(task_cpu(p), p); return cpu; -- cgit v1.2.1 From 0e2c8b8f55072a98b99e7bdad55c912084d6a526 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 20 Dec 2009 10:50:02 +0100 Subject: resources: fix call to alignf() in allocate_resource() The second parameter to alignf() in allocate_resource() must reflect what new resource is attempted to be allocated, else functions like pcibios_align_resource() (at least on x86) or pcmcia_align() can't work correctly. Commit 1e5ad9679016275d422e36b12a98b0927d76f556 broke this by setting the "new" resource until we're about to return success. To keep the resource untouched when allocate_resource() fails, a "tmp" resource is introduced. 
Signed-off-by: Dominik Brodowski Acked-by: Bjorn Helgaas Cc: Yinghai Lu Cc: Jesse Barnes Signed-off-by: Linus Torvalds --- kernel/resource.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index dc15686b7a77..af96c1e4b54b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -308,37 +308,37 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; - resource_size_t start, end; + struct resource tmp = *new; - start = root->start; + tmp.start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment - * of this->start - 1 to new->end below would cause an underflow. + * of this->start - 1 to tmp->end below would cause an underflow. */ if (this && this->start == 0) { - start = this->end + 1; + tmp.start = this->end + 1; this = this->sibling; } for(;;) { if (this) - end = this->start - 1; + tmp.end = this->start - 1; else - end = root->end; - if (start < min) - start = min; - if (end > max) - end = max; - start = ALIGN(start, align); + tmp.end = root->end; + if (tmp.start < min) + tmp.start = min; + if (tmp.end > max) + tmp.end = max; + tmp.start = ALIGN(tmp.start, align); if (alignf) - alignf(alignf_data, new, size, align); - if (start < end && end - start >= size - 1) { - new->start = start; - new->end = start + size - 1; + alignf(alignf_data, &tmp, size, align); + if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { + new->start = tmp.start; + new->end = tmp.start + size - 1; return 0; } if (!this) break; - start = this->end + 1; + tmp.start = this->end + 1; this = this->sibling; } return -EBUSY; -- cgit v1.2.1 From c757bea93bea4b77ebd181cc6dca60c15e3b1a2c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 21 Dec 2009 22:35:16 -0500 Subject: tracing: Fix setting tracer specific options The function __set_tracer_option() takes as its 
last parameter a "neg" value. If set it should negate the value of the option. The trace_options_write() passed the value written to the file which is what the new value needs to be set as. But since this is not the negative, it never sets the value. Reported-by: Peter Zijlstra Cc: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ee61915935d5..d0a4c12d1f1c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3949,7 +3949,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); ret = __set_tracer_option(current_trace, topt->flags, - topt->opt, val); + topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) return ret; -- cgit v1.2.1 From 628ff7c1d8d8466a5ad8078bd0206a130f8b8a51 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 18 Dec 2009 09:41:24 -0800 Subject: anonfd: Allow making anon files read-only It seems a couple places such as arch/ia64/kernel/perfmon.c and drivers/infiniband/core/uverbs_main.c could use anon_inode_getfile() instead of a private pseudo-fs + alloc_file(), if only there were a way to get a read-only file. So provide this by having anon_inode_getfile() create a read-only file if we pass O_RDONLY in flags. 
Signed-off-by: Roland Dreier Signed-off-by: Al Viro --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e0eb4a2fe183..1f38270f08c7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4724,7 +4724,7 @@ SYSCALL_DEFINE5(perf_event_open, if (IS_ERR(event)) goto err_put_context; - err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); + err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); if (err < 0) goto err_free_put_context; -- cgit v1.2.1 From 5300990c0370e804e49d9a59d928c5d53fb73487 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Dec 2009 10:15:07 -0500 Subject: Sanitize f_flags helpers * pull ACC_MODE to fs.h; we have several copies all over the place * nightmarish expression calculating f_mode by f_flags deserves a helper too (OPEN_FMODE(flags)) Signed-off-by: Al Viro --- kernel/auditsc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 267e484f0198..fc0f928167e7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -250,7 +250,6 @@ struct audit_context { #endif }; -#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) static inline int open_arg(int flags, int mask) { int n = ACC_MODE(flags); -- cgit v1.2.1 From 83f57a11d84460dfe2afdb5a8bc759953428e38b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 22 Dec 2009 14:10:37 -0800 Subject: Revert "time: Remove xtime_cache" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as requested by John Stultz. Quoting John: "Petr Titěra reported an issue where he saw odd atime regressions with 2.6.33 where there were a full second worth of nanoseconds in the nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled overflow of the nanosecond field caused by rounding up the sub-nanosecond accumulated time. Details: * At the end of update_wall_time(), we currently round up the sub-nanosecond portion of accumulated time when storing it into xtime. This was added to avoid time inconsistencies caused when the sub-nanosecond portion was truncated when storing into xtime. Unfortunately we don't handle the possible second overflow caused by that rounding. * Previously the xtime_cache code hid this overflow by normalizing the xtime value when storing into the xtime_cache. * We could try to handle the second overflow after the rounding up, but since this affects the timekeeping's internal state, this would further complicate the next accumulation cycle, causing small errors in ntp steering. As much as I'd like to get rid of it, the xtime_cache code is known to work. * The correct fix is really to include the sub-nanosecond portion in the timekeeping accessor function, so we don't need to round up during accumulation. This would greatly simplify the accumulation code. Unfortunately, we can't do this safely until the last three non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those patches are in -mm) and we kill off the spots where arches set xtime directly. This is all 2.6.34 material, so I think reverting the xtime_cache change is the best approach for now. Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra Requested-by: john stultz Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/time.c | 1 + kernel/time/timekeeping.c | 27 +++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index c6324d96009e..804798005d19 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -136,6 +136,7 @@ static inline void warp_clock(void) write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; + update_xtime_cache(0); write_sequnlock_irq(&xtime_lock); clock_was_set(); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index af4135f05825..7faaa32fbf4f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -165,6 +165,13 @@ struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; +static struct timespec xtime_cache __attribute__ ((aligned (16))); +void update_xtime_cache(u64 nsec) +{ + xtime_cache = xtime; + timespec_add_ns(&xtime_cache, nsec); +} + /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { @@ -325,6 +332,8 @@ int do_settimeofday(struct timespec *tv) xtime = *tv; + update_xtime_cache(0); + timekeeper.ntp_error = 0; ntp_clear(); @@ -550,6 +559,7 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); + update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -583,6 +593,7 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); total_sleep_time = timespec_add_safe(total_sleep_time, ts); } + update_xtime_cache(0); /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; @@ -722,6 +733,7 @@ static void
timekeeping_adjust(s64 offset) timekeeper.ntp_error_shift; } + /** * logarithmic_accumulation - shifted accumulation of cycles * @@ -765,6 +777,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) return offset; } + /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -774,6 +787,7 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; + u64 nsecs; int shift = 0, maxshift; /* Make sure we're fully resumed: */ @@ -839,6 +853,9 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; + nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); + update_xtime_cache(nsecs); + /* check to see if there is a new clocksource to use */ update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); } @@ -875,13 +892,13 @@ void monotonic_to_bootbased(struct timespec *ts) unsigned long get_seconds(void) { - return xtime.tv_sec; + return xtime_cache.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime; + return xtime_cache; } struct timespec current_kernel_time(void) @@ -891,7 +908,8 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime; + + now = xtime_cache; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -905,7 +923,8 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime; + + now = xtime_cache; mono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); -- cgit v1.2.1 From 45465487897a1c6d508b14b904dc5777f7ec7e04 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:26 -0800 Subject: kfifo: move struct kfifo in place This is a new generic kernel FIFO implementation. The current kernel fifo API is not very widely used, because it has to many constrains. Only 17 files in the current 2.6.31-rc5 used it. 
FIFOs are, like lists, a very basic thing, and a kfifo API which handles most use cases would save a lot of development time and memory resources. I think these are the reasons why kfifo is not in use: - The API is too simple, important functions are missing - A fifo can only be allocated dynamically - There is a requirement of a spinlock whether you need it or not - There is no support for data records inside a fifo So I decided to extend the kfifo in a more generic way without blowing up the API too much. The new API has the following benefits: - Generic usage: For kernel internal use and/or device drivers. - Provides an API for most use cases. - Slim API: The whole API provides 25 functions. - Linux style habit. - DECLARE_KFIFO, DEFINE_KFIFO and INIT_KFIFO Macros - Direct copy_to_user from the fifo and copy_from_user into the fifo. - The kfifo itself is an in-place member of the using data structure; this saves an indirection access and does not waste the kernel allocator. - Lockless access: if only one reader and one writer are active on the fifo, which is the common use case, no additional locking is necessary. - Remove spinlock - give the user the freedom to choose what kind of locking to use if one is required. - Ability to handle records. Three types of records are supported: - Variable length records between 0-255 bytes, with a record size field of 1 byte. - Variable length records between 0-65535 bytes, with a record size field of 2 bytes. - Fixed size records, with no record size field. - Preserve memory resources. - Performance! - Easy to use! This patch: Since most users want to have the kfifo as part of another object, reorganize the code to allow including struct kfifo in another data structure. This requires changing the kfifo_alloc and kfifo_init prototypes so that we pass an existing kfifo pointer into them. This patch changes the implementation and all existing users.
[akpm@linux-foundation.org: fix warning] Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 65 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 3765ff3c1bbe..8da6bb9782bb 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -1,6 +1,7 @@ /* - * A simple kernel FIFO implementation. + * A generic kernel FIFO implementation. * + * Copyright (C) 2009 Stefani Seibold * Copyright (C) 2004 Stelian Pop * * This program is free software; you can redistribute it and/or modify @@ -26,49 +27,51 @@ #include #include +static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, + unsigned int size, spinlock_t *lock) +{ + fifo->buffer = buffer; + fifo->size = size; + fifo->lock = lock; + + kfifo_reset(fifo); +} + /** - * kfifo_init - allocates a new FIFO using a preallocated buffer + * kfifo_init - initialize a FIFO using a preallocated buffer + * @fifo: the fifo to assign the buffer * @buffer: the preallocated buffer to be used. * @size: the size of the internal buffer, this have to be a power of 2. - * @gfp_mask: get_free_pages mask, passed to kmalloc() * @lock: the lock to be used to protect the fifo buffer * - * Do NOT pass the kfifo to kfifo_free() after use! Simply free the - * &struct kfifo with kfree(). 
*/ -struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, - gfp_t gfp_mask, spinlock_t *lock) +void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size, + spinlock_t *lock) { - struct kfifo *fifo; - /* size must be a power of 2 */ BUG_ON(!is_power_of_2(size)); - fifo = kmalloc(sizeof(struct kfifo), gfp_mask); - if (!fifo) - return ERR_PTR(-ENOMEM); - - fifo->buffer = buffer; - fifo->size = size; - fifo->in = fifo->out = 0; - fifo->lock = lock; - - return fifo; + _kfifo_init(fifo, buffer, size, lock); } EXPORT_SYMBOL(kfifo_init); /** - * kfifo_alloc - allocates a new FIFO and its internal buffer - * @size: the size of the internal buffer to be allocated. + * kfifo_alloc - allocates a new FIFO internal buffer + * @fifo: the fifo to assign then new buffer + * @size: the size of the buffer to be allocated, this have to be a power of 2. * @gfp_mask: get_free_pages mask, passed to kmalloc() * @lock: the lock to be used to protect the fifo buffer * + * This function dynamically allocates a new fifo internal buffer + * * The size will be rounded-up to a power of 2. + * The buffer will be release with kfifo_free(). 
+ * Return 0 if no error, otherwise the an error code */ -struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) +int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask, + spinlock_t *lock) { unsigned char *buffer; - struct kfifo *ret; /* * round up to the next power of 2, since our 'let the indices @@ -80,26 +83,24 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) } buffer = kmalloc(size, gfp_mask); - if (!buffer) - return ERR_PTR(-ENOMEM); - - ret = kfifo_init(buffer, size, gfp_mask, lock); + if (!buffer) { + _kfifo_init(fifo, 0, 0, NULL); + return -ENOMEM; + } - if (IS_ERR(ret)) - kfree(buffer); + _kfifo_init(fifo, buffer, size, lock); - return ret; + return 0; } EXPORT_SYMBOL(kfifo_alloc); /** - * kfifo_free - frees the FIFO + * kfifo_free - frees the FIFO internal buffer * @fifo: the fifo to be freed. */ void kfifo_free(struct kfifo *fifo) { kfree(fifo->buffer); - kfree(fifo); } EXPORT_SYMBOL(kfifo_free); -- cgit v1.2.1 From c1e13f25674ed564948ecb7dfe5f83e578892896 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:27 -0800 Subject: kfifo: move out spinlock Move the pointer to the spinlock out of struct kfifo. Most users in tree do not actually use a spinlock, so the few exceptions now have to call kfifo_{get,put}_locked, which takes an extra argument to a spinlock. 
Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 8da6bb9782bb..4950bdbe3477 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -28,11 +28,10 @@ #include static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, - unsigned int size, spinlock_t *lock) + unsigned int size) { fifo->buffer = buffer; fifo->size = size; - fifo->lock = lock; kfifo_reset(fifo); } @@ -42,16 +41,14 @@ static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, * @fifo: the fifo to assign the buffer * @buffer: the preallocated buffer to be used. * @size: the size of the internal buffer, this have to be a power of 2. - * @lock: the lock to be used to protect the fifo buffer * */ -void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size, - spinlock_t *lock) +void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size) { /* size must be a power of 2 */ BUG_ON(!is_power_of_2(size)); - _kfifo_init(fifo, buffer, size, lock); + _kfifo_init(fifo, buffer, size); } EXPORT_SYMBOL(kfifo_init); @@ -60,7 +57,6 @@ EXPORT_SYMBOL(kfifo_init); * @fifo: the fifo to assign then new buffer * @size: the size of the buffer to be allocated, this have to be a power of 2. * @gfp_mask: get_free_pages mask, passed to kmalloc() - * @lock: the lock to be used to protect the fifo buffer * * This function dynamically allocates a new fifo internal buffer * @@ -68,8 +64,7 @@ EXPORT_SYMBOL(kfifo_init); * The buffer will be release with kfifo_free(). 
* Return 0 if no error, otherwise the an error code */ -int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask, - spinlock_t *lock) +int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) { unsigned char *buffer; @@ -84,11 +79,11 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask, buffer = kmalloc(size, gfp_mask); if (!buffer) { - _kfifo_init(fifo, 0, 0, NULL); + _kfifo_init(fifo, 0, 0); return -ENOMEM; } - _kfifo_init(fifo, buffer, size, lock); + _kfifo_init(fifo, buffer, size); return 0; } -- cgit v1.2.1 From e64c026dd09b73faf20707711402fc5ed55a8e70 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:28 -0800 Subject: kfifo: cleanup namespace change name of __kfifo_* functions to kfifo_*, because the prefix __kfifo should be reserved for internal functions only. Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 4950bdbe3477..963ffde4af1a 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -100,7 +100,7 @@ void kfifo_free(struct kfifo *fifo) EXPORT_SYMBOL(kfifo_free); /** - * __kfifo_put - puts some data into the FIFO, no locking version + * kfifo_put - puts some data into the FIFO, no locking version * @fifo: the fifo to be used. * @buffer: the data to be added. * @len: the length of the data to be added. @@ -112,7 +112,7 @@ EXPORT_SYMBOL(kfifo_free); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. 
*/ -unsigned int __kfifo_put(struct kfifo *fifo, +unsigned int kfifo_put(struct kfifo *fifo, const unsigned char *buffer, unsigned int len) { unsigned int l; @@ -144,10 +144,10 @@ unsigned int __kfifo_put(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(__kfifo_put); +EXPORT_SYMBOL(kfifo_put); /** - * __kfifo_get - gets some data from the FIFO, no locking version + * kfifo_get - gets some data from the FIFO, no locking version * @fifo: the fifo to be used. * @buffer: where the data must be copied. * @len: the size of the destination buffer. @@ -158,7 +158,7 @@ EXPORT_SYMBOL(__kfifo_put); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int __kfifo_get(struct kfifo *fifo, +unsigned int kfifo_get(struct kfifo *fifo, unsigned char *buffer, unsigned int len) { unsigned int l; @@ -190,4 +190,4 @@ unsigned int __kfifo_get(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(__kfifo_get); +EXPORT_SYMBOL(kfifo_get); -- cgit v1.2.1 From 7acd72eb85f1c7a15e8b5eb554994949241737f1 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:28 -0800 Subject: kfifo: rename kfifo_put... into kfifo_in... and kfifo_get... into kfifo_out... rename kfifo_put... into kfifo_in... to prevent miss use of old non in kernel-tree drivers ditto for kfifo_get... -> kfifo_out... Improve the prototypes of kfifo_in and kfifo_out to make the kerneldoc annotations more readable. 
Add mini "howto porting to the new API" in kfifo.h Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 963ffde4af1a..d659442e73f2 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -100,20 +100,20 @@ void kfifo_free(struct kfifo *fifo) EXPORT_SYMBOL(kfifo_free); /** - * kfifo_put - puts some data into the FIFO, no locking version + * kfifo_in - puts some data into the FIFO * @fifo: the fifo to be used. - * @buffer: the data to be added. + * @from: the data to be added. * @len: the length of the data to be added. * - * This function copies at most @len bytes from the @buffer into + * This function copies at most @len bytes from the @from buffer into * the FIFO depending on the free space, and returns the number of * bytes copied. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. 
*/ -unsigned int kfifo_put(struct kfifo *fifo, - const unsigned char *buffer, unsigned int len) +unsigned int kfifo_in(struct kfifo *fifo, + const unsigned char *from, unsigned int len) { unsigned int l; @@ -128,10 +128,10 @@ unsigned int kfifo_put(struct kfifo *fifo, /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); - memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); + memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), from, l); /* then put the rest (if any) at the beginning of the buffer */ - memcpy(fifo->buffer, buffer + l, len - l); + memcpy(fifo->buffer, from + l, len - l); /* * Ensure that we add the bytes to the kfifo -before- @@ -144,22 +144,22 @@ unsigned int kfifo_put(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(kfifo_put); +EXPORT_SYMBOL(kfifo_in); /** - * kfifo_get - gets some data from the FIFO, no locking version + * kfifo_out - gets some data from the FIFO * @fifo: the fifo to be used. - * @buffer: where the data must be copied. + * @to: where the data must be copied. * @len: the size of the destination buffer. * * This function copies at most @len bytes from the FIFO into the - * @buffer and returns the number of copied bytes. + * @to buffer and returns the number of copied bytes. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. 
*/ -unsigned int kfifo_get(struct kfifo *fifo, - unsigned char *buffer, unsigned int len) +unsigned int kfifo_out(struct kfifo *fifo, + unsigned char *to, unsigned int len) { unsigned int l; @@ -174,10 +174,10 @@ unsigned int kfifo_get(struct kfifo *fifo, /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); - memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); + memcpy(to, fifo->buffer + (fifo->out & (fifo->size - 1)), l); /* then get the rest (if any) from the beginning of the buffer */ - memcpy(buffer + l, fifo->buffer, len - l); + memcpy(to + l, fifo->buffer, len - l); /* * Ensure that we remove the bytes from the kfifo -before- @@ -190,4 +190,4 @@ unsigned int kfifo_get(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(kfifo_get); +EXPORT_SYMBOL(kfifo_out); -- cgit v1.2.1 From a121f24accac1600bf5b6fb1e12eeabdfed7cb1a Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:31 -0800 Subject: kfifo: add kfifo_skip, kfifo_from_user and kfifo_to_user Add kfifo_reset_out() for save lockless discard the fifo output Add kfifo_skip() to skip a number of output bytes Add kfifo_from_user() to copy user space data into the fifo Add kfifo_to_user() to copy fifo data to user space Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 123 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index d659442e73f2..2a78425ef67f 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -26,6 +26,7 @@ #include #include #include +#include static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size) @@ -99,6 +100,21 @@ void kfifo_free(struct kfifo *fifo) } 
EXPORT_SYMBOL(kfifo_free); +/** + * kfifo_skip - skip output data + * @fifo: the fifo to be used. + * @len: number of bytes to skip + */ +void kfifo_skip(struct kfifo *fifo, unsigned int len) +{ + if (len < kfifo_len(fifo)) { + __kfifo_add_out(fifo, len); + return; + } + kfifo_reset_out(fifo); +} +EXPORT_SYMBOL(kfifo_skip); + /** * kfifo_in - puts some data into the FIFO * @fifo: the fifo to be used. @@ -115,6 +131,7 @@ EXPORT_SYMBOL(kfifo_free); unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, unsigned int len) { + unsigned int off; unsigned int l; len = min(len, fifo->size - fifo->in + fifo->out); @@ -126,21 +143,16 @@ unsigned int kfifo_in(struct kfifo *fifo, smp_mb(); + off = __kfifo_off(fifo, fifo->in); + /* first put the data starting from fifo->in to buffer end */ - l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); - memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), from, l); + l = min(len, fifo->size - off); + memcpy(fifo->buffer + off, from, l); /* then put the rest (if any) at the beginning of the buffer */ memcpy(fifo->buffer, from + l, len - l); - /* - * Ensure that we add the bytes to the kfifo -before- - * we update the fifo->in index. 
- */ - - smp_wmb(); - - fifo->in += len; + __kfifo_add_in(fifo, len); return len; } @@ -161,6 +173,7 @@ EXPORT_SYMBOL(kfifo_in); unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len) { + unsigned int off; unsigned int l; len = min(len, fifo->in - fifo->out); @@ -172,22 +185,116 @@ unsigned int kfifo_out(struct kfifo *fifo, smp_rmb(); + off = __kfifo_off(fifo, fifo->out); + /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); - memcpy(to, fifo->buffer + (fifo->out & (fifo->size - 1)), l); + l = min(len, fifo->size - off); + memcpy(to, fifo->buffer + off, l); /* then get the rest (if any) from the beginning of the buffer */ memcpy(to + l, fifo->buffer, len - l); + __kfifo_add_out(fifo, len); + + return len; +} +EXPORT_SYMBOL(kfifo_out); + +/** + * kfifo_from_user - puts some data from user space into the FIFO + * @fifo: the fifo to be used. + * @from: pointer to the data to be added. + * @len: the length of the data to be added. + * + * This function copies at most @len bytes from the @from into the + * FIFO depending and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +unsigned int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int len) +{ + unsigned int off; + unsigned int l; + int ret; + + len = min(len, fifo->size - fifo->in + fifo->out); + /* - * Ensure that we remove the bytes from the kfifo -before- - * we update the fifo->out index. + * Ensure that we sample the fifo->out index -before- we + * start putting bytes into the kfifo. 
*/ smp_mb(); - fifo->out += len; + off = __kfifo_off(fifo, fifo->in); + + /* first put the data starting from fifo->in to buffer end */ + l = min(len, fifo->size - off); + ret = copy_from_user(fifo->buffer + off, from, l); + + if (unlikely(ret)) + return l - ret; + + /* then put the rest (if any) at the beginning of the buffer */ + ret = copy_from_user(fifo->buffer, from + l, len - l); + + if (unlikely(ret)) + return len - ret; + + __kfifo_add_in(fifo, len); return len; } -EXPORT_SYMBOL(kfifo_out); +EXPORT_SYMBOL(kfifo_from_user); + +/** + * kfifo_to_user - gets data from the FIFO and write it to user space + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * + * This function copies at most @len bytes from the FIFO into the + * @to buffer and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +unsigned int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int len) +{ + unsigned int off; + unsigned int l; + int ret; + + len = min(len, fifo->in - fifo->out); + + /* + * Ensure that we sample the fifo->in index -before- we + * start removing bytes from the kfifo. 
+ */ + + smp_rmb(); + + off = __kfifo_off(fifo, fifo->out); + + /* first get the data from fifo->out until the end of the buffer */ + l = min(len, fifo->size - off); + ret = copy_to_user(to, fifo->buffer + off, l); + + if (unlikely(ret)) + return l - ret; + + /* then get the rest (if any) from the beginning of the buffer */ + ret = copy_to_user(to + l, fifo->buffer, len - l); + + if (unlikely(ret)) + return len - ret; + + __kfifo_add_out(fifo, len); + + return len; +} +EXPORT_SYMBOL(kfifo_to_user); + -- cgit v1.2.1 From 86d4880313603810901f639ccb5c88ff13d4ad3c Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:32 -0800 Subject: kfifo: add record handling functions Add kfifo_in_rec() - puts some record data into the FIFO Add kfifo_out_rec() - gets some record data from the FIFO Add kfifo_from_user_rec() - puts some data from user space into the FIFO Add kfifo_to_user_rec() - gets data from the FIFO and write it to user space Add kfifo_peek_rec() - gets the size of the next FIFO record field Add kfifo_skip_rec() - skip the next fifo out record Add kfifo_avail_rec() - determinate the number of bytes available in a record FIFO Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 286 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 193 insertions(+), 93 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 2a78425ef67f..e92d519f93b1 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -115,27 +115,11 @@ void kfifo_skip(struct kfifo *fifo, unsigned int len) } EXPORT_SYMBOL(kfifo_skip); -/** - * kfifo_in - puts some data into the FIFO - * @fifo: the fifo to be used. - * @from: the data to be added. - * @len: the length of the data to be added. 
- * - * This function copies at most @len bytes from the @from buffer into - * the FIFO depending on the free space, and returns the number of - * bytes copied. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int kfifo_in(struct kfifo *fifo, - const unsigned char *from, unsigned int len) +static inline void __kfifo_in_data(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int off) { - unsigned int off; unsigned int l; - len = min(len, fifo->size - fifo->in + fifo->out); - /* * Ensure that we sample the fifo->out index -before- we * start putting bytes into the kfifo. @@ -143,7 +127,7 @@ unsigned int kfifo_in(struct kfifo *fifo, smp_mb(); - off = __kfifo_off(fifo, fifo->in); + off = __kfifo_off(fifo, fifo->in + off); /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - off); @@ -151,33 +135,13 @@ unsigned int kfifo_in(struct kfifo *fifo, /* then put the rest (if any) at the beginning of the buffer */ memcpy(fifo->buffer, from + l, len - l); - - __kfifo_add_in(fifo, len); - - return len; } -EXPORT_SYMBOL(kfifo_in); -/** - * kfifo_out - gets some data from the FIFO - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - * - * This function copies at most @len bytes from the FIFO into the - * @to buffer and returns the number of copied bytes. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int kfifo_out(struct kfifo *fifo, - unsigned char *to, unsigned int len) +static inline void __kfifo_out_data(struct kfifo *fifo, + void *to, unsigned int len, unsigned int off) { - unsigned int off; unsigned int l; - len = min(len, fifo->in - fifo->out); - /* * Ensure that we sample the fifo->in index -before- we * start removing bytes from the kfifo. 
@@ -185,7 +149,7 @@ unsigned int kfifo_out(struct kfifo *fifo, smp_rmb(); - off = __kfifo_off(fifo, fifo->out); + off = __kfifo_off(fifo, fifo->out + off); /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - off); @@ -193,34 +157,14 @@ unsigned int kfifo_out(struct kfifo *fifo, /* then get the rest (if any) from the beginning of the buffer */ memcpy(to + l, fifo->buffer, len - l); - - __kfifo_add_out(fifo, len); - - return len; } -EXPORT_SYMBOL(kfifo_out); -/** - * kfifo_from_user - puts some data from user space into the FIFO - * @fifo: the fifo to be used. - * @from: pointer to the data to be added. - * @len: the length of the data to be added. - * - * This function copies at most @len bytes from the @from into the - * FIFO depending and returns the number of copied bytes. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int len) +static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off) { - unsigned int off; unsigned int l; int ret; - len = min(len, fifo->size - fifo->in + fifo->out); - /* * Ensure that we sample the fifo->out index -before- we * start putting bytes into the kfifo. 
@@ -228,29 +172,101 @@ unsigned int kfifo_from_user(struct kfifo *fifo, smp_mb(); - off = __kfifo_off(fifo, fifo->in); + off = __kfifo_off(fifo, fifo->in + off); /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - off); ret = copy_from_user(fifo->buffer + off, from, l); if (unlikely(ret)) - return l - ret; + return ret + len - l; /* then put the rest (if any) at the beginning of the buffer */ - ret = copy_from_user(fifo->buffer, from + l, len - l); + return copy_from_user(fifo->buffer, from + l, len - l); +} + +static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int off) +{ + unsigned int l; + int ret; + + /* + * Ensure that we sample the fifo->in index -before- we + * start removing bytes from the kfifo. + */ + + smp_rmb(); + + off = __kfifo_off(fifo, fifo->out + off); + + /* first get the data from fifo->out until the end of the buffer */ + l = min(len, fifo->size - off); + ret = copy_to_user(to, fifo->buffer + off, l); if (unlikely(ret)) - return len - ret; + return ret + len - l; - __kfifo_add_in(fifo, len); + /* then get the rest (if any) from the beginning of the buffer */ + return copy_to_user(to + l, fifo->buffer, len - l); +} +unsigned int __kfifo_in_n(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int recsize) +{ + if (kfifo_avail(fifo) < len + recsize) + return len + 1; + + __kfifo_in_data(fifo, from, len, recsize); + return 0; +} +EXPORT_SYMBOL(__kfifo_in_n); + +/** + * kfifo_in - puts some data into the FIFO + * @fifo: the fifo to be used. + * @from: the data to be added. + * @len: the length of the data to be added. + * + * This function copies at most @len bytes from the @from buffer into + * the FIFO depending on the free space, and returns the number of + * bytes copied. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, + unsigned int len) +{ + len = min(kfifo_avail(fifo), len); + + __kfifo_in_data(fifo, from, len, 0); + __kfifo_add_in(fifo, len); return len; } -EXPORT_SYMBOL(kfifo_from_user); +EXPORT_SYMBOL(kfifo_in); + +unsigned int __kfifo_in_generic(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int recsize) +{ + return __kfifo_in_rec(fifo, from, len, recsize); +} +EXPORT_SYMBOL(__kfifo_in_generic); + +unsigned int __kfifo_out_n(struct kfifo *fifo, + void *to, unsigned int len, unsigned int recsize) +{ + if (kfifo_len(fifo) < len + recsize) + return len; + + __kfifo_out_data(fifo, to, len, recsize); + __kfifo_add_out(fifo, len + recsize); + return 0; +} +EXPORT_SYMBOL(__kfifo_out_n); /** - * kfifo_to_user - gets data from the FIFO and write it to user space + * kfifo_out - gets some data from the FIFO * @fifo: the fifo to be used. * @to: where the data must be copied. * @len: the size of the destination buffer. @@ -261,40 +277,124 @@ EXPORT_SYMBOL(kfifo_from_user); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int len) +unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len) { - unsigned int off; - unsigned int l; - int ret; + len = min(kfifo_len(fifo), len); - len = min(len, fifo->in - fifo->out); + __kfifo_out_data(fifo, to, len, 0); + __kfifo_add_out(fifo, len); - /* - * Ensure that we sample the fifo->in index -before- we - * start removing bytes from the kfifo. 
- */ + return len; +} +EXPORT_SYMBOL(kfifo_out); - smp_rmb(); +unsigned int __kfifo_out_generic(struct kfifo *fifo, + void *to, unsigned int len, unsigned int recsize, + unsigned int *total) +{ + return __kfifo_out_rec(fifo, to, len, recsize, total); +} +EXPORT_SYMBOL(__kfifo_out_generic); - off = __kfifo_off(fifo, fifo->out); +unsigned int __kfifo_from_user_n(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int recsize) +{ + if (kfifo_avail(fifo) < len + recsize) + return len + 1; - /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - off); - ret = copy_to_user(to, fifo->buffer + off, l); + return __kfifo_from_user_data(fifo, from, len, recsize); +} +EXPORT_SYMBOL(__kfifo_from_user_n); - if (unlikely(ret)) - return l - ret; +/** + * kfifo_from_user - puts some data from user space into the FIFO + * @fifo: the fifo to be used. + * @from: pointer to the data to be added. + * @len: the length of the data to be added. + * + * This function copies at most @len bytes from the @from into the + * FIFO depending and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +unsigned int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int len) +{ + len = min(kfifo_avail(fifo), len); + len -= __kfifo_from_user_data(fifo, from, len, 0); + __kfifo_add_in(fifo, len); + return len; +} +EXPORT_SYMBOL(kfifo_from_user); - /* then get the rest (if any) from the beginning of the buffer */ - ret = copy_to_user(to + l, fifo->buffer, len - l); +unsigned int __kfifo_from_user_generic(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int recsize) +{ + return __kfifo_from_user_rec(fifo, from, len, recsize); +} +EXPORT_SYMBOL(__kfifo_from_user_generic); - if (unlikely(ret)) - return len - ret; +unsigned int __kfifo_to_user_n(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int reclen, + unsigned int recsize) +{ + unsigned int ret; - __kfifo_add_out(fifo, len); + if (kfifo_len(fifo) < reclen + recsize) + return len; + ret = __kfifo_to_user_data(fifo, to, reclen, recsize); + + if (likely(ret == 0)) + __kfifo_add_out(fifo, reclen + recsize); + + return ret; +} +EXPORT_SYMBOL(__kfifo_to_user_n); + +/** + * kfifo_to_user - gets data from the FIFO and write it to user space + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * + * This function copies at most @len bytes from the FIFO into the + * @to buffer and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +unsigned int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int len) +{ + len = min(kfifo_len(fifo), len); + len -= __kfifo_to_user_data(fifo, to, len, 0); + __kfifo_add_out(fifo, len); return len; } EXPORT_SYMBOL(kfifo_to_user); +unsigned int __kfifo_to_user_generic(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int recsize, + unsigned int *total) +{ + return __kfifo_to_user_rec(fifo, to, len, recsize, total); +} +EXPORT_SYMBOL(__kfifo_to_user_generic); + +unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) +{ + if (recsize == 0) + return kfifo_avail(fifo); + + return __kfifo_peek_n(fifo, recsize); +} +EXPORT_SYMBOL(__kfifo_peek_generic); + +void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) +{ + __kfifo_skip_rec(fifo, recsize); +} +EXPORT_SYMBOL(__kfifo_skip_generic); + -- cgit v1.2.1 From 0c69774e6ce94364cfaa8bdeb18061edc414bc5a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Dec 2009 15:43:19 +0100 Subject: sched: Revert 738d2be, simplify set_task_cpu() Effectively reverts 738d2be4301007f054541c5c4bf7fb6a361c9b3a. As demonstrated by Eric, we really need to call __set_task_cpu() early in the fork() path to properly initialize the various task state -- specifically the cgroup state through set_task_rq(). [ we could probably fix this by explicitly calling __set_task_cpu() from sched_fork(), but lets try that for the next cycle and simply revert to the old behaviour for now. 
] Reported-by: Eric Paris Tested-by: Eric Paris Signed-off-by: Peter Zijlstra Cc: efault@gmx.de LKML-Reference: <1261492999.4937.36.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 87f1f47beffe..c535cc4f6428 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2045,11 +2045,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); - if (task_cpu(p) == new_cpu) - return; - - p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); + if (task_cpu(p) != new_cpu) { + p->se.nr_migrations++; + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); + } __set_task_cpu(p, new_cpu); } -- cgit v1.2.1 From 4440095c8268c1a5e11577097d2be429cec036ca Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Dec 2009 21:00:20 +0100 Subject: SYSCTL: Print binary sysctl warnings (nearly) only once When printing legacy sysctls, print the warning message for each of them only once. This way there is a guarantee the syslog won't be flooded for any sane program. The original attempt at this made the tables non-const and stored the flag inline. Linus suggested using a separate hash table for this; this is based on a code snippet from him. The hash implies this is not exact and can sometimes not print a new sysctl due to a hash collision, but in practice this should not be a problem. I used an FNV32 hash over the binary string with a 32-byte bitmap. This gives relatively few collisions when all the predefined binary sysctls are hashed: size 256 bucket length number 0: [25] 1: [67] 2: [88] 3: [47] 4: [22] 5: [6] 6: [1] The worst case is a single collision of 6 hash values. 
Signed-off-by: Andi Kleen --- kernel/sysctl_binary.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 112533d5fc08..8f5d16e0707a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1417,6 +1417,35 @@ static void deprecated_sysctl_warning(const int *name, int nlen) return; } +#define WARN_ONCE_HASH_BITS 8 +#define WARN_ONCE_HASH_SIZE (1< Date: Mon, 21 Dec 2009 13:02:24 +0100 Subject: kprobes: Fix distinct type warning Every time I see this: kernel/kprobes.c: In function 'register_kretprobe': kernel/kprobes.c:1038: warning: comparison of distinct pointer types lacks a cast I'm wondering if something changed in common code and we need to do something for s390. Apparently that's not the case. Let's get rid of this annoying warning. Signed-off-by: Heiko Carstens Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu LKML-Reference: <20091221120224.GA4471@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5342a344c43..b7df302a0204 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1035,7 +1035,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { #ifdef CONFIG_PREEMPT - rp->maxactive = max(10, 2 * num_possible_cpus()); + rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); #else rp->maxactive = num_possible_cpus(); #endif -- cgit v1.2.1 From 40892367bc893f3abf6f5ca8ac2ed1c98ba26a77 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 21 Dec 2009 12:01:17 -0800 Subject: tracing: Kconfig spelling fixes and cleanups Fix filename reference (ftrace-implementation.txt -> ftrace-design.txt). Fix spelling, punctuation, grammar. 
Fix help text indentation and line lengths to reduce need for horizontal scrolling or larger window sizes. Signed-off-by: Randy Dunlap Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091221120117.3fb49cdc.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 112 +++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d006554888dc..6c22d8a2f289 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -12,17 +12,17 @@ config NOP_TRACER config HAVE_FTRACE_NMI_ENTER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_GRAPH_TRACER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_GRAPH_FP_TEST bool @@ -34,17 +34,17 @@ config HAVE_FUNCTION_GRAPH_FP_TEST config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_DYNAMIC_FTRACE bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FTRACE_MCOUNT_RECORD bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_HW_BRANCH_TRACER bool @@ -52,7 +52,7 @@ config HAVE_HW_BRANCH_TRACER config HAVE_SYSCALL_TRACEPOINTS bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config TRACER_MAX_TRACE bool @@ -83,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP # This allows those options to appear when no other tracer is selected. But the # options do not appear when something else selects it. 
We need the two options # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the -# hidding of the automatic options. +# hiding of the automatic options. config TRACING bool @@ -119,7 +119,7 @@ menuconfig FTRACE bool "Tracers" default y if DEBUG_KERNEL help - Enable the kernel tracing infrastructure. + Enable the kernel tracing infrastructure. if FTRACE @@ -133,7 +133,7 @@ config FUNCTION_TRACER help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation - instruction to the beginning of every kernel function, which NOP + instruction at the beginning of every kernel function, which NOP sequence is then dynamically patched into a tracer call when tracing is enabled by the administrator. If it's runtime disabled (the bootup default), then the overhead of the instructions is very @@ -150,7 +150,7 @@ config FUNCTION_GRAPH_TRACER and its entry. Its first purpose is to trace the duration of functions and draw a call graph for each thread with some information like - the return value. This is done by setting the current return + the return value. This is done by setting the current return address on the current task structure into a stack of calls. @@ -173,7 +173,7 @@ config IRQSOFF_TRACER echo 0 > /sys/kernel/debug/tracing/tracing_max_latency - (Note that kernel size and overhead increases with this option + (Note that kernel size and overhead increase with this option enabled. This option and the preempt-off timing option can be used together or separately.) @@ -186,7 +186,7 @@ config PREEMPT_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP help - This option measures the time spent in preemption off critical + This option measures the time spent in preemption-off critical sections, with microsecond accuracy. 
The default measurement method is a maximum search, which is @@ -195,7 +195,7 @@ config PREEMPT_TRACER echo 0 > /sys/kernel/debug/tracing/tracing_max_latency - (Note that kernel size and overhead increases with this option + (Note that kernel size and overhead increase with this option enabled. This option and the irqs-off timing option can be used together or separately.) @@ -222,7 +222,7 @@ config ENABLE_DEFAULT_TRACERS depends on !GENERIC_TRACER select TRACING help - This tracer hooks to various trace points in the kernel + This tracer hooks to various trace points in the kernel, allowing the user to pick and choose which trace point they want to trace. It also includes the sched_switch tracer plugin. @@ -265,19 +265,19 @@ choice The likely/unlikely profiler only looks at the conditions that are annotated with a likely or unlikely macro. - The "all branch" profiler will profile every if statement in the + The "all branch" profiler will profile every if-statement in the kernel. This profiler will also enable the likely/unlikely - profiler as well. + profiler. - Either of the above profilers add a bit of overhead to the system. - If unsure choose "No branch profiling". + Either of the above profilers adds a bit of overhead to the system. + If unsure, choose "No branch profiling". config BRANCH_PROFILE_NONE bool "No branch profiling" help - No branch profiling. Branch profiling adds a bit of overhead. - Only enable it if you want to analyse the branching behavior. - Otherwise keep it disabled. + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. 
config PROFILE_ANNOTATED_BRANCHES bool "Trace likely/unlikely profiler" @@ -288,7 +288,7 @@ config PROFILE_ANNOTATED_BRANCHES /sys/kernel/debug/tracing/profile_annotated_branch - Note: this will add a significant overhead, only turn this + Note: this will add a significant overhead; only turn this on if you need to profile the system's use of these macros. config PROFILE_ALL_BRANCHES @@ -305,7 +305,7 @@ config PROFILE_ALL_BRANCHES This configuration, when enabled, will impose a great overhead on the system. This should only be enabled when the system - is to be analyzed + is to be analyzed in much detail. endchoice config TRACING_BRANCHES @@ -335,7 +335,7 @@ config POWER_TRACER depends on X86 select GENERIC_TRACER help - This tracer helps developers to analyze and optimize the kernels + This tracer helps developers to analyze and optimize the kernel's power management decisions, specifically the C-state and P-state behavior. @@ -391,14 +391,14 @@ config HW_BRANCH_TRACER select GENERIC_TRACER help This tracer records all branches on the system in a circular - buffer giving access to the last N branches for each cpu. + buffer, giving access to the last N branches for each cpu. config KMEMTRACE bool "Trace SLAB allocations" select GENERIC_TRACER help kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected data is then fed to the userspace application in order to analyse allocation hotspots, internal fragmentation and so on, making it possible to see how well an allocator performs, as well as debug @@ -417,15 +417,15 @@ config WORKQUEUE_TRACER bool "Trace workqueues" select GENERIC_TRACER help - The workqueue tracer provides some statistical informations + The workqueue tracer provides some statistical information about each cpu workqueue thread such as the number of the works inserted and executed since their creation. 
It can help - to evaluate the amount of work each of them have to perform. + to evaluate the amount of work each of them has to perform. For example it can help a developer to decide whether he should - choose a per cpu workqueue instead of a singlethreaded one. + choose a per-cpu workqueue instead of a singlethreaded one. config BLK_DEV_IO_TRACE - bool "Support for tracing block io actions" + bool "Support for tracing block IO actions" depends on SYSFS depends on BLOCK select RELAY @@ -456,15 +456,15 @@ config KPROBE_EVENT select TRACING default y help - This allows the user to add tracing events (similar to tracepoints) on the fly - via the ftrace interface. See Documentation/trace/kprobetrace.txt - for more details. + This allows the user to add tracing events (similar to tracepoints) + on the fly via the ftrace interface. See + Documentation/trace/kprobetrace.txt for more details. Those events can be inserted wherever kprobes can probe, and record various register and memory values. - This option is also required by perf-probe subcommand of perf tools. If - you want to use perf tools, this option is strongly recommended. + This option is also required by perf-probe subcommand of perf tools. + If you want to use perf tools, this option is strongly recommended. config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" @@ -472,32 +472,32 @@ config DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE default y help - This option will modify all the calls to ftrace dynamically - (will patch them out of the binary image and replaces them - with a No-Op instruction) as they are called. A table is - created to dynamically enable them again. + This option will modify all the calls to ftrace dynamically + (will patch them out of the binary image and replace them + with a No-Op instruction) as they are called. A table is + created to dynamically enable them again. 
- This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise - has native performance as long as no tracing is active. + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but + otherwise has native performance as long as no tracing is active. - The changes to the code are done by a kernel thread that - wakes up once a second and checks to see if any ftrace calls - were made. If so, it runs stop_machine (stops all CPUS) - and modifies the code to jump over the call to ftrace. + The changes to the code are done by a kernel thread that + wakes up once a second and checks to see if any ftrace calls + were made. If so, it runs stop_machine (stops all CPUS) + and modifies the code to jump over the call to ftrace. config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER default n help - This option enables the kernel function profiler. A file is created - in debugfs called function_profile_enabled which defaults to zero. - When a 1 is echoed into this file profiling begins, and when a - zero is entered, profiling stops. A file in the trace_stats - directory called functions, that show the list of functions that - have been hit and their counters. + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A "functions" file is created in + the trace_stats directory; this file shows the list of functions that + have been hit and their counters. - If in doubt, say N + If in doubt, say N. config FTRACE_MCOUNT_RECORD def_bool y @@ -556,8 +556,8 @@ config RING_BUFFER_BENCHMARK tristate "Ring buffer benchmark stress tester" depends on RING_BUFFER help - This option creates a test to stress the ring buffer and bench mark it. 
- It creates its own ring buffer such that it will not interfer with + This option creates a test to stress the ring buffer and benchmark it. + It creates its own ring buffer such that it will not interfere with any other users of the ring buffer (such as ftrace). It then creates a producer and consumer that will run for 10 seconds and sleep for 10 seconds. Each interval it will print out the number of events @@ -566,7 +566,7 @@ config RING_BUFFER_BENCHMARK It does not disable interrupts or raise its priority, so it may be affected by processes that are running. - If unsure, say N + If unsure, say N. endif # FTRACE -- cgit v1.2.1 From 88f7a890d74137ab0d126a5d65679cd620f1a289 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:22:22 +0800 Subject: ksym_tracer: Fix to make the tracer work ksym tracer doesn't work: # echo tasklist_lock:rw- > ksym_trace_filter -bash: echo: write error: No such device It's because we pass to perf_event_create_kernel_counter() a cpu number which is not present. 
Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF19E.1010201@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 10 +++++++--- kernel/trace/trace_ksym.c | 1 - 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 366eedf949c0..48fb0bb6992a 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -388,7 +389,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, if (!cpu_events) return ERR_PTR(-ENOMEM); - for_each_possible_cpu(cpu) { + get_online_cpus(); + for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); @@ -399,18 +401,20 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, goto fail; } } + put_online_cpus(); return cpu_events; fail: - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); if (IS_ERR(*pevent)) break; unregister_hw_breakpoint(*pevent); } + put_online_cpus(); + free_percpu(cpu_events); - /* return the error if any */ return ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index faf37fa4408c..340b6ff193e0 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -197,7 +197,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) entry->attr.bp_addr = addr; entry->attr.bp_len = HW_BREAKPOINT_LEN_4; - ret = -EAGAIN; entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, ksym_hbp_handler); -- cgit v1.2.1 From 3d13ec2efdb5843ad91e57b60d50b44d922cf063 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:23:19 +0800 Subject: ksym_tracer: Fix to allow writing newline to ksym_trace_filter It used to work, but now doesn't: # echo > ksym_filter bash: 
echo: write error: Invalid argument It's caused by d954fbf0ff6b5fdfb32350e85a2f15d3db976506 ("tracing: Fix wrong usage of strstrip in trace_ksyms"). Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF1D7.5040400@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 340b6ff193e0..160a8d8b37a2 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -299,8 +299,8 @@ static ssize_t ksym_trace_filter_write(struct file *file, * 2: echo 0 > ksym_trace_filter * 3: echo "*:---" > ksym_trace_filter */ - if (!buf[0] || !strcmp(buf, "0") || - !strcmp(buf, "*:---")) { + if (!input_string[0] || !strcmp(input_string, "0") || + !strcmp(input_string, "*:---")) { __ksym_trace_reset(); ret = 0; goto out; -- cgit v1.2.1 From e6d9491bf8ba6728cc86aeabbc688d20ec0563b5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:23:40 +0800 Subject: ksym_tracer: Fix race when incrementing count We are under rcu read section but not holding the write lock, so count++ is not atomic. Use atomic64_t instead. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF1EC.9010608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 160a8d8b37a2..67d79f709fc5 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -32,6 +32,8 @@ #include #include +#include + /* * For now, let us restrict the no. of symbols traced simultaneously to number * of available hardware breakpoint registers. 
@@ -44,7 +46,7 @@ struct trace_ksym { struct perf_event **ksym_hbp; struct perf_event_attr attr; #ifdef CONFIG_PROFILE_KSYM_TRACER - unsigned long counter; + atomic64_t counter; #endif struct hlist_node ksym_hlist; }; @@ -69,9 +71,8 @@ void ksym_collect_stats(unsigned long hbp_hit_addr) rcu_read_lock(); hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - if ((entry->attr.bp_addr == hbp_hit_addr) && - (entry->counter <= MAX_UL_INT)) { - entry->counter++; + if (entry->attr.bp_addr == hbp_hit_addr) { + atomic64_inc(&entry->counter); break; } } @@ -501,7 +502,8 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v) seq_printf(m, " %-36s", fn_name); else seq_printf(m, " %-36s", ""); - seq_printf(m, " %15lu\n", entry->counter); + seq_printf(m, " %15llu\n", + (unsigned long long)atomic64_read(&entry->counter)); return 0; } -- cgit v1.2.1 From 53ab668064edaeef99c0ee22799483d45f4c81f6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:24:03 +0800 Subject: ksym_tracer: Remove trace_stat trace_stat is problematic. Don't use it, use seqfile instead. This fixes a race that reading the stat file is not protected by any lock, which can lead to use after free. 
Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF203.40200@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 127 ++++++++++++++++++---------------------------- 1 file changed, 50 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 67d79f709fc5..94103cdcf9d8 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -26,7 +26,6 @@ #include #include "trace_output.h" -#include "trace_stat.h" #include "trace.h" #include @@ -444,103 +443,77 @@ struct tracer ksym_tracer __read_mostly = .print_line = ksym_trace_output }; -__init static int init_ksym_trace(void) -{ - struct dentry *d_tracer; - struct dentry *entry; - - d_tracer = tracing_init_dentry(); - ksym_filter_entry_count = 0; - - entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer, - NULL, &ksym_tracing_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'ksym_trace_filter' file\n"); - - return register_tracer(&ksym_tracer); -} -device_initcall(init_ksym_trace); - - #ifdef CONFIG_PROFILE_KSYM_TRACER -static int ksym_tracer_stat_headers(struct seq_file *m) +static int ksym_profile_show(struct seq_file *m, void *v) { + struct hlist_node *node; + struct trace_ksym *entry; + int access_type = 0; + char fn_name[KSYM_NAME_LEN]; + seq_puts(m, " Access Type "); seq_puts(m, " Symbol Counter\n"); seq_puts(m, " ----------- "); seq_puts(m, " ------ -------\n"); - return 0; -} -static int ksym_tracer_stat_show(struct seq_file *m, void *v) -{ - struct hlist_node *stat = v; - struct trace_ksym *entry; - int access_type = 0; - char fn_name[KSYM_NAME_LEN]; + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); + access_type = entry->attr.bp_type; - access_type = entry->attr.bp_type; + switch (access_type) { + case HW_BREAKPOINT_R: + seq_puts(m, " R 
"); + break; + case HW_BREAKPOINT_W: + seq_puts(m, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + seq_puts(m, " RW "); + break; + default: + seq_puts(m, " NA "); + } - switch (access_type) { - case HW_BREAKPOINT_R: - seq_puts(m, " R "); - break; - case HW_BREAKPOINT_W: - seq_puts(m, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - seq_puts(m, " RW "); - break; - default: - seq_puts(m, " NA "); + if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) + seq_printf(m, " %-36s", fn_name); + else + seq_printf(m, " %-36s", ""); + seq_printf(m, " %15llu\n", + (unsigned long long)atomic64_read(&entry->counter)); } - - if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) - seq_printf(m, " %-36s", fn_name); - else - seq_printf(m, " %-36s", ""); - seq_printf(m, " %15llu\n", - (unsigned long long)atomic64_read(&entry->counter)); + rcu_read_unlock(); return 0; } -static void *ksym_tracer_stat_start(struct tracer_stat *trace) +static int ksym_profile_open(struct inode *node, struct file *file) { - return ksym_filter_head.first; -} - -static void * -ksym_tracer_stat_next(void *v, int idx) -{ - struct hlist_node *stat = v; - - return stat->next; + return single_open(file, ksym_profile_show, NULL); } -static struct tracer_stat ksym_tracer_stats = { - .name = "ksym_tracer", - .stat_start = ksym_tracer_stat_start, - .stat_next = ksym_tracer_stat_next, - .stat_headers = ksym_tracer_stat_headers, - .stat_show = ksym_tracer_stat_show +static const struct file_operations ksym_profile_fops = { + .open = ksym_profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; +#endif /* CONFIG_PROFILE_KSYM_TRACER */ -__init static int ksym_tracer_stat_init(void) +__init static int init_ksym_trace(void) { - int ret; + struct dentry *d_tracer; - ret = register_stat_tracer(&ksym_tracer_stats); - if (ret) { - printk(KERN_WARNING "Warning: could not register " - "ksym tracer stats\n"); - return 1; - } + d_tracer = tracing_init_dentry(); - 
return 0; + trace_create_file("ksym_trace_filter", 0644, d_tracer, + NULL, &ksym_tracing_fops); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + trace_create_file("ksym_profile", 0444, d_tracer, + NULL, &ksym_profile_fops); +#endif + + return register_tracer(&ksym_tracer); } -fs_initcall(ksym_tracer_stat_init); -#endif /* CONFIG_PROFILE_KSYM_TRACER */ +device_initcall(init_ksym_trace); -- cgit v1.2.1 From 79b408210885b9f7f0b067b07a09d68f4da3a700 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:19 +0800 Subject: tracing/kprobe: Show sign of fields in trace_kprobe format files The format files of trace_kprobe do not show the sign of the fields. The other format files show the field signed type of the fields and this patch makes the trace_kprobe formats consistent with the others. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D27.5040009@cn.fujitsu.com> Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7ecab06547a5..83f1e6ef7063 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1182,10 +1182,11 @@ static int __probe_event_show_format(struct trace_seq *s, #undef SHOW_FIELD #define SHOW_FIELD(type, item, name) \ do { \ - ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ - "offset:%u;\tsize:%u;\n", name, \ + ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ + "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ (unsigned int)offsetof(typeof(field), item),\ - (unsigned int)sizeof(type)); \ + (unsigned int)sizeof(type), \ + is_signed_type(type)); \ if (!ret) \ return 0; \ } while (0) -- cgit v1.2.1 From fb7ae981cb9fe8665b9da97e8734745e030c151d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:38 +0800 Subject: tracing: Fix sign fields in ftrace_define_fields_##call() Add is_signed_type() call to 
trace_define_field() in ftrace macros. The code previously just passed in 0 (false), disregarding whether or not the field was actually a signed type. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D3A.6020007@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 458e5bfe26d0..d4fa5dc1ee4e 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -158,7 +158,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), 0, FILTER_OTHER); \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; @@ -168,8 +169,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), \ container.item), \ - sizeof(field.container.item), 0, \ - FILTER_OTHER); \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; -- cgit v1.2.1 From 05cbaa2853cdfc255fdd04e65a82bfe9208c4e52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Dec 2009 16:00:35 +0100 Subject: perf: Fix NULL deref in inheritance code Liming found a NULL deref when a task has a perf context but no counters when it forks. This can occur in two cases, a race during construction where the fork hits after installing the context but before the first counter gets inserted, or more reproducibly, a fork after the last counter is closed (which leaves the context around).
Reported-by: Wang Liming Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras CC: LKML-Reference: <1262185684.7135.222.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 03cc061398d1..58ed1dae5875 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5148,7 +5148,7 @@ int perf_event_init_task(struct task_struct *child) GFP_KERNEL); if (!child_ctx) { ret = -ENOMEM; - goto exit; + break; } __perf_event_init_context(child_ctx, child); @@ -5164,7 +5164,7 @@ int perf_event_init_task(struct task_struct *child) } } - if (inherited_all) { + if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. @@ -5184,7 +5184,6 @@ int perf_event_init_task(struct task_struct *child) get_ctx(child_ctx->parent_ctx); } -exit: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); -- cgit v1.2.1 From 0f4bd46ec252887f44f1f065b41867cac8f70dfb Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 22 Dec 2009 03:15:43 +0000 Subject: kmsg_dump: Dump on crash_kexec as well crash_kexec gets called before kmsg_dump(KMSG_DUMP_OOPS) if panic_on_oops is set, so the kernel log buffer is not stored for this case. This patch adds a KMSG_DUMP_KEXEC dump type which gets called when crash_kexec() is invoked. To avoid getting double dumps, the old KMSG_DUMP_PANIC is moved below crash_kexec(). The mtdoops driver is modified to handle KMSG_DUMP_KEXEC in the same way as a panic. 
Signed-off-by: KOSAKI Motohiro Acked-by: Simon Kagstrom Signed-off-by: David Woodhouse --- kernel/kexec.c | 4 ++++ kernel/panic.c | 3 ++- kernel/printk.c | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 433e9fcc1fc5..ae217488fef8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -1074,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs) if (mutex_trylock(&kexec_mutex)) { if (kexec_crash_image) { struct pt_regs fixed_regs; + + kmsg_dump(KMSG_DUMP_KEXEC); + crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); diff --git a/kernel/panic.c b/kernel/panic.c index 5827f7b97254..c787333282b8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -75,7 +75,6 @@ NORET_TYPE void panic(const char * fmt, ...) dump_stack(); #endif - kmsg_dump(KMSG_DUMP_PANIC); /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. @@ -83,6 +82,8 @@ NORET_TYPE void panic(const char * fmt, ...) 
*/ crash_kexec(NULL); + kmsg_dump(KMSG_DUMP_PANIC); + /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic diff --git a/kernel/printk.c b/kernel/printk.c index 1ded8e7dd19b..2c9dc0b03a5e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1467,6 +1467,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static const char const *kmsg_reasons[] = { [KMSG_DUMP_OOPS] = "oops", [KMSG_DUMP_PANIC] = "panic", + [KMSG_DUMP_KEXEC] = "kexec", }; static const char *kmsg_to_str(enum kmsg_dump_reason reason) -- cgit v1.2.1 From 10b465aaf9536ee5a16652fa0700740183d48ec9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 19 Dec 2009 14:43:01 +0000 Subject: modules: Skip empty sections when exporting section notes Commit 35dead4 "modules: don't export section names of empty sections via sysfs" changed the set of sections that have attributes, but did not change the iteration over these attributes in add_notes_attrs(). This can lead to add_notes_attrs() creating attributes with the wrong names or with null name pointers. Introduce a sect_empty() function and use it in both add_sect_attrs() and add_notes_attrs(). Reported-by: Martin Michlmayr Signed-off-by: Ben Hutchings Tested-by: Martin Michlmayr Cc: stable@kernel.org Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e96b8ed1cb6a..f82386bd9ee9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1010,6 +1010,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, * J. 
Corbet */ #if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) + +static inline bool sect_empty(const Elf_Shdr *sect) +{ + return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; +} + struct module_sect_attr { struct module_attribute mattr; @@ -1051,8 +1057,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, /* Count loaded sections and allocate structures */ for (i = 0; i < nsect; i++) - if (sechdrs[i].sh_flags & SHF_ALLOC - && sechdrs[i].sh_size) + if (!sect_empty(&sechdrs[i])) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1070,9 +1075,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, sattr = &sect_attrs->attrs[0]; gattr = &sect_attrs->grp.attrs[0]; for (i = 0; i < nsect; i++) { - if (! (sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - if (!sechdrs[i].sh_size) + if (sect_empty(&sechdrs[i])) continue; sattr->address = sechdrs[i].sh_addr; sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, @@ -1156,7 +1159,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, /* Count notes sections and allocate structures.
*/ notes = 0; for (i = 0; i < nsect; i++) - if ((sechdrs[i].sh_flags & SHF_ALLOC) && + if (!sect_empty(&sechdrs[i]) && (sechdrs[i].sh_type == SHT_NOTE)) ++notes; @@ -1172,7 +1175,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, notes_attrs->notes = notes; nattr = &notes_attrs->attrs[0]; for (loaded = i = 0; i < nsect; ++i) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + if (sect_empty(&sechdrs[i])) continue; if (sechdrs[i].sh_type == SHT_NOTE) { nattr->attr.name = mod->sect_attrs->attrs[loaded].name; -- cgit v1.2.1 From 5ded3dc6a3c7549b36a8ac27bbd81b33756a2c29 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Wed, 6 Jan 2010 17:12:07 -0800 Subject: ring-buffer: Wrap a list.next reference with rb_list_head() This reference at the end of rb_get_reader_page() was causing off-by-one writes to the prev pointer of the page after the reader page when that page is the head page, and therefore the reader page has the RB_PAGE_HEAD flag in its list.next pointer. This eventually results in a GPF in a subsequent call to rb_set_head_page() (usually from rb_get_reader_page()) when that prev pointer is dereferenced. The dereferenced register would characteristically have an address that appears shifted left by one byte (eg, ffxxxxxxxxxxxxyy instead of ffffxxxxxxxxxxxx) due to being written at an address one byte too high. Signed-off-by: David Sharp LKML-Reference: <1262826727-9090-1-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2326b04c95c4..d5b7308b7e1b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2906,7 +2906,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * * Now make the new head point back to the reader page.
*/ - reader->list.next->prev = &cpu_buffer->reader_page->list; + rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(cpu_buffer, &cpu_buffer->head_page); /* Finally update the reader page to the new head */ -- cgit v1.2.1 From 0e1ff5d72a6393f2ef5dbf74f58bb55a12d63834 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Jan 2010 20:40:44 -0500 Subject: ring-buffer: Add rb_list_head() wrapper around new reader page next field If the very unlikely case happens where the writer moves the head by one between where the head page is read and where the new reader page is assigned _and_ the writer then writes and wraps the entire ring buffer so that the head page is back to what was originally read as the head page, the page to be swapped will have a corrupted next pointer. Simple solution is to wrap the assignment of the next pointer with a rb_list_head(). Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d5b7308b7e1b..edefe3b2801b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2869,7 +2869,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * Splice the empty reader page into the list around the head. */ reader = rb_set_head_page(cpu_buffer); - cpu_buffer->reader_page->list.next = reader->list.next; + cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; /* -- cgit v1.2.1 From 8767ba2796a1c894e6d9524584a26a8224f0543d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 8 Jan 2010 14:42:38 -0800 Subject: kmod: fix resource leak in call_usermodehelper_pipe() Fix resource (write-pipe file) leak in call_usermodehelper_pipe(). When call_usermodehelper_exec() fails, write-pipe file is opened and call_usermodehelper_pipe() just returns an error. 
Since it is hard for caller to determine whether the error occurred when opening the pipe or executing the helper, the caller cannot close the pipe by themselves. I've found this resource leak when testing coredump. You can check how the resource leaks as below; $ echo "|nocommand" > /proc/sys/kernel/core_pattern $ ulimit -c unlimited $ while [ 1 ]; do ./segv; done &> /dev/null & $ cat /proc/meminfo (<- repeat it) where segv.c is; //----- int main () { char *p = 0; *p = 1; } //----- This patch closes write-pipe file if call_usermodehelper_exec() failed. Signed-off-by: Masami Hiramatsu Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 25b103190364..bf0e231d9702 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, return -ENOMEM; ret = call_usermodehelper_stdinpipe(sub_info, filp); - if (ret < 0) - goto out; + if (ret < 0) { + call_usermodehelper_freeinfo(sub_info); + return ret; + } - return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + if (ret < 0) /* Failed to execute helper, close pipe */ + filp_close(*filp, NULL); - out: - call_usermodehelper_freeinfo(sub_info); return ret; } EXPORT_SYMBOL(call_usermodehelper_pipe); -- cgit v1.2.1 From bd4f490a079730aadfaf9a728303ea0135c01945 Mon Sep 17 00:00:00 2001 From: Dave Anderson Date: Fri, 8 Jan 2010 14:42:50 -0800 Subject: cgroups: fix 2.6.32 regression causing BUG_ON() in cgroup_diput() The LTP cgroup test suite generates a "kernel BUG at kernel/cgroup.c:790!" here in cgroup_diput(): /* * if we're getting rid of the cgroup, refcount should ensure * that there are no pidlists left.
*/ BUG_ON(!list_empty(&cgrp->pidlists)); The cgroup pidlist rework in 2.6.32 generates the BUG_ON, which is caused when pidlist_array_load() calls cgroup_pidlist_find(): (1) if a matching cgroup_pidlist is found, it down_write's the mutex of the pre-existing cgroup_pidlist, and increments its use_count. (2) if no matching cgroup_pidlist is found, then a new one is allocated, it down_write's its mutex, and the use_count is set to 0. (3) the matching, or new, cgroup_pidlist gets returned back to pidlist_array_load(), which increments its use_count -- regardless whether new or pre-existing -- and up_write's the mutex. So if a matching list is ever encountered by cgroup_pidlist_find() during the life of a cgroup directory, it results in an inflated use_count value, preventing it from ever getting released by cgroup_release_pid_array(). Then if the directory is subsequently removed, cgroup_diput() hits the BUG_ON() when it finds that the directory's cgroup is still populated with a pidlist. The patch simply removes the use_count increment when a matching pidlist is found by cgroup_pidlist_find(), because it gets bumped by the calling pidlist_array_load() function while still protected by the list's mutex. 
Signed-off-by: Dave Anderson Reviewed-by: Li Zefan Acked-by: Ben Blum Cc: Paul Menage Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0249f4be9b5c..1fbcc748044a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2468,7 +2468,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, /* make sure l doesn't vanish out from under us */ down_write(&l->mutex); mutex_unlock(&cgrp->pidlist_mutex); - l->use_count++; return l; } } -- cgit v1.2.1 From b45c6e76bc2c72f6426c14bed64fdcbc9bf37cb0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 8 Jan 2010 14:42:52 -0800 Subject: kernel/signal.c: fix kernel information leak with print-fatal-signals=1 When print-fatal-signals is enabled it's possible to dump any memory reachable by the kernel to the log by simply jumping to that address from user space. Or crash the system if there's some hardware with read side effects. The fatal signals handler will dump 16 bytes at the execution address, which is fully controlled by ring 3. In addition when something jumps to a unmapped address there will be up to 16 additional useless page faults, which might be potentially slow (and at least is not very efficient) Fortunately this option is off by default and only there on i386. But fix it by checking for kernel addresses and also stopping when there's a page fault. 
Signed-off-by: Andi Kleen Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d09692b40376..934ae5e687b9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -979,7 +979,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr) for (i = 0; i < 16; i++) { unsigned char insn; - __get_user(insn, (unsigned char *)(regs->ip + i)); + if (get_user(insn, (unsigned char *)(regs->ip + i))) + break; printk("%02x ", insn); } } -- cgit v1.2.1 From 7485d0d3758e8e6491a5c9468114e74dc050785d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 5 Jan 2010 16:32:43 +0900 Subject: futexes: Remove rw parameter from get_futex_key() Currently, futexes have two problems: A) The current futex code doesn't handle private file mappings properly. get_futex_key() uses PageAnon() to distinguish file and anon, which can cause the following bad scenario: 1) thread-A calls futex(private-mapping, FUTEX_WAIT), it sleeps on file mapping object. 2) thread-B writes a variable and it makes it cow. 3) thread-B calls futex(private-mapping, FUTEX_WAKE), it wakes up blocked thread on the anonymous page. (but it's nothing) B) Current futex code doesn't handle zero page properly. Read mode get_user_pages() can return zero page, but current futex code doesn't handle it at all. Then, zero page makes infinite loop internally. The solution is to use write mode get_user_page() always for page lookup. It prevents the lookup of both file page of private mappings and zero page. Performance concerns: Probably very little, because glibc always initializes variables for futex before calling futex(). It means glibc users never see the overhead of this patch. Compatibility concerns: This patch has few compatibility issues.
After this patch, FUTEX_WAIT requires writable access to futex variables (read-only mappings makes EFAULT). But practically it's not a problem, glibc always initializes variables for futexes explicitly - nobody uses read-only mappings. Reported-by: Hugh Dickins Signed-off-by: KOSAKI Motohiro Acked-by: Peter Zijlstra Acked-by: Darren Hart Cc: Cc: Linus Torvalds Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: Ulrich Drepper LKML-Reference: <20100105162633.45A2.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 8e3c3ffe1b9a..d9b3a2228f9d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key) * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. - * @rw: mapping needs to be read/write (values: VERIFY_READ, - * VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key) * lock_page() might sleep, the caller should not hold a spinlock.
*/ static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) return -EFAULT; key->private.mm = mm; key->private.address = address; @@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) } again: - err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); + err = get_user_pages_fast(address, 1, 1, &page); if (err < 0) return err; @@ -867,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -913,10 +911,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1175,11 +1173,10 @@ retry: pi_state = NULL; } - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2, - requeue_pi ? 
VERIFY_WRITE : VERIFY_READ); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1738,7 +1735,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, */ retry: q->key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); + ret = get_futex_key(uaddr, fshared, &q->key); if (unlikely(ret != 0)) return ret; @@ -1904,7 +1901,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, q.requeue_pi_key = NULL; retry: q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); + ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out; @@ -2023,7 +2020,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -2215,7 +2212,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, rt_waiter.task = NULL; key2 = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out; -- cgit v1.2.1 From 751e9983ee276cb150e8812b1d995f6035a63878 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:53:02 +0800 Subject: ftrace: Fix MATCH_END_ONLY function filter For '*foo' pattern, we should allow any string ending with 'foo', but ftrace filter incorrectly disallows strings like bar_foo_foo: # echo '*io' > set_ftrace_filter # cat set_ftrace_filter | grep 'req_bio_endio' # cat available_filter_functions | grep 'req_bio_endio' req_bio_endio Signed-off-by: Li Zefan LKML-Reference: <4B4E870E.6060607@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7968762c8167..1e6640f80454 100644 --- 
a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1690,7 +1690,7 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin) static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; - char *ptr; + int slen; switch (type) { case MATCH_FULL: @@ -1706,8 +1706,8 @@ static int ftrace_match(char *str, char *regex, int len, int type) matched = 1; break; case MATCH_END_ONLY: - ptr = strstr(str, regex); - if (ptr && (ptr[len] == 0)) + slen = strlen(str); + if (slen >= len && memcmp(str + slen - len, regex, len) == 0) matched = 1; break; } -- cgit v1.2.1 From 285caad415f459f336247932b4db95a571357a02 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:53:21 +0800 Subject: tracing/filters: Fix MATCH_FRONT_ONLY filter matching MATCH_FRONT_ONLY actually is a full matching: # ./perf record -R -f -a -e lock:lock_acquire \ --filter 'name ~rcu_*' sleep 1 # ./perf trace (no output) We should pass the length of the pattern string to strncmp(). Signed-off-by: Li Zefan LKML-Reference: <4B4E8721.5090301@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 50504cb228de..11c3973e6552 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -261,7 +261,7 @@ static int regex_match_full(char *str, struct regex *r, int len) static int regex_match_front(char *str, struct regex *r, int len) { - if (strncmp(str, r->pattern, len) == 0) + if (strncmp(str, r->pattern, r->len) == 0) return 1; return 0; } -- cgit v1.2.1 From a3291c14ecf0a995e30d993b7f2cae031de98727 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:53:41 +0800 Subject: tracing/filters: Fix MATCH_END_ONLY filter matching For '*foo' pattern, we should allow any string ending with 'foo', but event filtering 
incorrectly disallows strings like bar_foo_foo: Signed-off-by: Li Zefan LKML-Reference: <4B4E8735.6070604@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 11c3973e6552..49e44dd17851 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -275,9 +275,10 @@ static int regex_match_middle(char *str, struct regex *r, int len) static int regex_match_end(char *str, struct regex *r, int len) { - char *ptr = strstr(str, r->pattern); + int strlen = len - 1; - if (ptr && (ptr[r->len] == 0)) + if (strlen >= r->len && + memcmp(str + strlen - r->len, r->pattern, r->len) == 0) return 1; return 0; } -- cgit v1.2.1 From b2af211f284eb1bef19fbb85fc8ef551bb1e7460 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:54:11 +0800 Subject: tracing/filters: Fix MATCH_MIDDLE_ONLY filter matching The @str might not be NULL-terminated if it's of type DYN_STRING or STATIC_STRING, so we should use strnstr() instead of strstr(). 
Signed-off-by: Li Zefan LKML-Reference: <4B4E8753.2000102@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 49e44dd17851..f364b085397e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -268,7 +268,7 @@ static int regex_match_front(char *str, struct regex *r, int len) static int regex_match_middle(char *str, struct regex *r, int len) { - if (strstr(str, r->pattern)) + if (strnstr(str, r->pattern, len)) return 1; return 0; } -- cgit v1.2.1 From 16da27a8bc7a0d050686d1b2e9efb53fab9ed226 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:54:27 +0800 Subject: tracing/filters: Fix MATCH_FULL filter matching for PTR_STRING MATCH_FULL matching for PTR_STRING is not working correctly: # echo 'func == vt' > events/bkl/lock_kernel/filter # echo 1 > events/bkl/lock_kernel/enable ... # cat trace Xorg-1484 [000] 1973.392586: lock_kernel: ... func=vt_ioctl() gpm-1402 [001] 1974.027740: lock_kernel: ... func=vt_ioctl() We should pass to regex.match(..., len) the length (including '\0') of the source string instead of the length of the pattern string. 
Signed-off-by: Li Zefan LKML-Reference: <4B4E8763.5070707@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f364b085397e..60c2a4efad4a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -211,8 +211,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, { char **addr = (char **)(event + pred->offset); int cmp, match; + int len = strlen(*addr) + 1; /* including tailing '\0' */ - cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); + cmp = pred->regex.match(*addr, &pred->regex, len); match = cmp ^ pred->not; @@ -782,10 +783,8 @@ static int filter_add_pred(struct filter_parse_state *ps, pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; - else { + else fn = filter_pred_pchar; - pred->regex.field_len = strlen(pred->regex.pattern); - } } else { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); -- cgit v1.2.1 From d1303dd1d6b220cab375f24fa91a5640e54e169e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:54:40 +0800 Subject: tracing/filters: Add comment for match callbacks We should be clear on 2 things: - the length parameter of a match callback includes tailing '\0'. - the string to be searched might not be NULL-terminated. 
Signed-off-by: Li Zefan LKML-Reference: <4B4E8770.7000608@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 60c2a4efad4a..e42af9aad69f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -252,7 +252,18 @@ static int filter_pred_none(struct filter_pred *pred, void *event, return 0; } -/* Basic regex callbacks */ +/* + * regex_match_foo - Basic regex callbacks + * + * @str: the string to be searched + * @r: the regex structure containing the pattern string + * @len: the length of the string to be searched (including '\0') + * + * Note: + * - @str might not be NULL-terminated if it's of type DYN_STRING + * or STATIC_STRING + */ + static int regex_match_full(char *str, struct regex *r, int len) { if (strncmp(str, r->pattern, len) == 0) -- cgit v1.2.1 From 8ecc2951534af10e04ddb5e5ff5c6d217b79f5c2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:12 -0800 Subject: kfifo: use void * pointers for user buffers The pointers to user buffers are currently unsigned char *, which requires a lot of casting in the caller for any non-char typed buffers. Use void * instead. 
Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index e92d519f93b1..ab615e695052 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -28,7 +28,7 @@ #include #include -static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, +static void _kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) { fifo->buffer = buffer; @@ -44,7 +44,7 @@ static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, * @size: the size of the internal buffer, this have to be a power of 2. * */ -void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size) +void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) { /* size must be a power of 2 */ BUG_ON(!is_power_of_2(size)); @@ -235,7 +235,7 @@ EXPORT_SYMBOL(__kfifo_in_n); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, +unsigned int kfifo_in(struct kfifo *fifo, const void *from, unsigned int len) { len = min(kfifo_avail(fifo), len); @@ -277,7 +277,7 @@ EXPORT_SYMBOL(__kfifo_out_n); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. 
*/ -unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len) +unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) { len = min(kfifo_len(fifo), len); -- cgit v1.2.1 From 64ce1037c5434b1d036cd99ecaee6e00496bc2e9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:15 -0800 Subject: kfifo: sanitize *_user error handling Right now for kfifo_*_user it's not easily possible to distingush between a user copy failing and the FIFO not containing enough data. The problem is that both conditions are multiplexed into the same return code. Avoid this by moving the "copy length" into a separate output parameter and only return 0/-EFAULT in the main return value. I didn't fully adapt the weird "record" variants, those seem to be unused anyways and were rather messy (should they be just removed?) I would appreciate some double checking if I did all the conversions correctly. Signed-off-by: Andi Kleen Cc: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 76 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index ab615e695052..b50bb622e8b0 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -159,8 +159,9 @@ static inline void __kfifo_out_data(struct kfifo *fifo, memcpy(to + l, fifo->buffer, len - l); } -static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int off) +static inline int __kfifo_from_user_data(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off, + unsigned *lenout) { unsigned int l; int ret; @@ -177,16 +178,20 @@ static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - off); 
ret = copy_from_user(fifo->buffer + off, from, l); - - if (unlikely(ret)) - return ret + len - l; + if (unlikely(ret)) { + *lenout = ret; + return -EFAULT; + } + *lenout = l; /* then put the rest (if any) at the beginning of the buffer */ - return copy_from_user(fifo->buffer, from + l, len - l); + ret = copy_from_user(fifo->buffer, from + l, len - l); + *lenout += ret ? ret : len - l; + return ret ? -EFAULT : 0; } -static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int off) +static inline int __kfifo_to_user_data(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int off, unsigned *lenout) { unsigned int l; int ret; @@ -203,12 +208,21 @@ static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - off); ret = copy_to_user(to, fifo->buffer + off, l); - - if (unlikely(ret)) - return ret + len - l; + *lenout = l; + if (unlikely(ret)) { + *lenout -= ret; + return -EFAULT; + } /* then get the rest (if any) from the beginning of the buffer */ - return copy_to_user(to + l, fifo->buffer, len - l); + len -= l; + ret = copy_to_user(to + l, fifo->buffer, len); + if (unlikely(ret)) { + *lenout += len - ret; + return -EFAULT; + } + *lenout += len; + return 0; } unsigned int __kfifo_in_n(struct kfifo *fifo, @@ -299,10 +313,13 @@ EXPORT_SYMBOL(__kfifo_out_generic); unsigned int __kfifo_from_user_n(struct kfifo *fifo, const void __user *from, unsigned int len, unsigned int recsize) { + unsigned total; + if (kfifo_avail(fifo) < len + recsize) return len + 1; - return __kfifo_from_user_data(fifo, from, len, recsize); + __kfifo_from_user_data(fifo, from, len, recsize, &total); + return total; } EXPORT_SYMBOL(__kfifo_from_user_n); @@ -313,18 +330,21 @@ EXPORT_SYMBOL(__kfifo_from_user_n); * @len: the length of the data to be added. 
* * This function copies at most @len bytes from the @from into the - * FIFO depending and returns the number of copied bytes. + * FIFO depending and returns -EFAULT/0. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int len) +int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned *total) { + int ret; len = min(kfifo_avail(fifo), len); - len -= __kfifo_from_user_data(fifo, from, len, 0); + ret = __kfifo_from_user_data(fifo, from, len, 0, total); + if (ret) + return ret; __kfifo_add_in(fifo, len); - return len; + return 0; } EXPORT_SYMBOL(kfifo_from_user); @@ -339,17 +359,17 @@ unsigned int __kfifo_to_user_n(struct kfifo *fifo, void __user *to, unsigned int len, unsigned int reclen, unsigned int recsize) { - unsigned int ret; + unsigned int ret, total; if (kfifo_len(fifo) < reclen + recsize) return len; - ret = __kfifo_to_user_data(fifo, to, reclen, recsize); + ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); if (likely(ret == 0)) __kfifo_add_out(fifo, reclen + recsize); - return ret; + return total; } EXPORT_SYMBOL(__kfifo_to_user_n); @@ -358,20 +378,22 @@ EXPORT_SYMBOL(__kfifo_to_user_n); * @fifo: the fifo to be used. * @to: where the data must be copied. * @len: the size of the destination buffer. + @ @lenout: pointer to output variable with copied data * * This function copies at most @len bytes from the FIFO into the - * @to buffer and returns the number of copied bytes. + * @to buffer and 0 or -EFAULT. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. 
*/ -unsigned int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int len) +int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned *lenout) { + int ret; len = min(kfifo_len(fifo), len); - len -= __kfifo_to_user_data(fifo, to, len, 0); - __kfifo_add_out(fifo, len); - return len; + ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); + __kfifo_add_out(fifo, *lenout); + return ret; } EXPORT_SYMBOL(kfifo_to_user); -- cgit v1.2.1 From a5b9e2c1063046421ce01dcf5ddd7ec12567f3e1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:16 -0800 Subject: kfifo: add kfifo_out_peek In some upcoming code it's useful to peek into a FIFO without permanently removing data. This patch implements a new kfifo_out_peek() to do this. Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index b50bb622e8b0..7384f120be87 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -302,6 +302,27 @@ unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) } EXPORT_SYMBOL(kfifo_out); +/** + * kfifo_out_peek - copy some data from the FIFO, but do not remove it + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * @offset: offset into the fifo + * + * This function copies at most @len bytes at @offset from the FIFO + * into the @to buffer and returns the number of copied bytes. + * The data is not removed from the FIFO.
+ */ +unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, + unsigned offset) +{ + len = min(kfifo_len(fifo), len + offset); + + __kfifo_out_data(fifo, to, len, offset); + return len; +} +EXPORT_SYMBOL(kfifo_out_peek); + unsigned int __kfifo_out_generic(struct kfifo *fifo, void *to, unsigned int len, unsigned int recsize, unsigned int *total) -- cgit v1.2.1 From 5dab600e6a153ceb64832f608069e6c08185411a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:17 -0800 Subject: kfifo: document everywhere that size has to be power of two On my first try using them I missed that the fifos need to be power of two, resulting in a runtime bug. Document that requirement everywhere (and fix one grammar bug) Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 7384f120be87..32c5c15d750d 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -41,7 +41,7 @@ static void _kfifo_init(struct kfifo *fifo, void *buffer, * kfifo_init - initialize a FIFO using a preallocated buffer * @fifo: the fifo to assign the buffer * @buffer: the preallocated buffer to be used. - * @size: the size of the internal buffer, this have to be a power of 2. + * @size: the size of the internal buffer, this has to be a power of 2. 
* */ void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) -- cgit v1.2.1 From af2422c42c0ff42b8b93dbb3a5fe65250fb65c40 Mon Sep 17 00:00:00 2001 From: David John Date: Fri, 15 Jan 2010 17:01:23 -0800 Subject: smp_call_function_any(): pass the node value to cpumask_of_node() The change in acpi_cpufreq to use smp_call_function_any causes a warning when it is called since the function erroneously passes the cpu id to cpumask_of_node rather than the node that the cpu is on. Fix this. cpumask_of_node(3): node > nr_node_ids(1) Pid: 1, comm: swapper Not tainted 2.6.33-rc3-00097-g2c1f189 #223 Call Trace: [] cpumask_of_node+0x23/0x58 [] smp_call_function_any+0x65/0xfa [] ? do_drv_read+0x0/0x2f [] get_cur_val+0xb0/0x102 [] get_cur_freq_on_cpu+0x74/0xc5 [] acpi_cpufreq_cpu_init+0x417/0x515 [] ? __down_write+0xb/0xd [] cpufreq_add_dev+0x278/0x922 Signed-off-by: David John Cc: Suresh Siddha Cc: Rusty Russell Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index de735a6637d0..f10408422444 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -347,7 +347,7 @@ int smp_call_function_any(const struct cpumask *mask, goto call; /* Try for same node. */ - nodemask = cpumask_of_node(cpu); + nodemask = cpumask_of_node(cpu_to_node(cpu)); for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; cpu = cpumask_next_and(cpu, nodemask, mask)) { if (cpu_online(cpu)) -- cgit v1.2.1 From ea9d8e3f45404d411c00ae67b45cc35c58265bb7 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 7 Jan 2010 11:22:44 +0800 Subject: clockevent: Don't remove broadcast device when cpu is dead Marc reported that the BUG_ON in clockevents_notify() triggers on his system. This happens because the kernel tries to remove an active clock event device (used for broadcasting) from the device list. 
The handling of devices which can be used as per cpu device and as a global broadcast device is suboptimal. The simplest solution for now (and for stable) is to check whether the device is used as global broadcast device, but this needs to be revisited. [ tglx: restored the cpuweight check and massaged the changelog ] Reported-by: Marc Dionne Tested-by: Marc Dionne Signed-off-by: Xiaotian Feng LKML-Reference: <1262834564-13033-1-git-send-email-dfeng@redhat.com> Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/time/clockevents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 6f740d9f0948..d7395fdfb9f3 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -259,7 +259,8 @@ void clockevents_notify(unsigned long reason, void *arg) cpu = *((int *)arg); list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { if (cpumask_test_cpu(cpu, dev->cpumask) && - cpumask_weight(dev->cpumask) == 1) { + cpumask_weight(dev->cpumask) == 1 && + !tick_is_broadcast_device(dev)) { BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); list_del(&dev->list); } -- cgit v1.2.1 From 50b926e439620c469565e8be0f28be78f5fca1ce Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 4 Jan 2010 14:44:56 +0100 Subject: sched: Fix vmark regression on big machines SD_PREFER_SIBLING is set at the CPU domain level if power saving isn't enabled, leading to many cache misses on large machines as we traverse looking for an idle shared cache to wake to. Change the enabler of select_idle_sibling() to SD_SHARE_PKG_RESOURCES, and enable same at the sibling domain level. 
Reported-by: Lin Ming Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1262612696.15495.15.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 42ac3c9f66f6..8fe7ee81c552 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1508,7 +1508,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag * If there's an idle sibling in this domain, make that * the wake_affine target instead of the current cpu. */ - if (tmp->flags & SD_PREFER_SIBLING) + if (tmp->flags & SD_SHARE_PKG_RESOURCES) target = select_idle_sibling(p, tmp, target); if (target >= 0) { -- cgit v1.2.1 From 6d558c3ac9b6508d26fd5cadccce51fc9d726b1c Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 11 Jan 2010 14:21:25 +0800 Subject: sched: Reassign prev and switch_count when reacquire_kernel_lock() fail Assume A->B schedule is processing, if B have acquired BKL before and it need reschedule this time. Then on B's context, it will go to need_resched_nonpreemptible for reschedule. But at this time, prev and switch_count are related to A. It's wrong and will lead to incorrect scheduler statistics. 
Signed-off-by: Yong Zhang Signed-off-by: Peter Zijlstra LKML-Reference: <2674af741001102238w7b0ddcadref00d345e2181d11@mail.gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c535cc4f6428..4508fe7048be 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5530,8 +5530,11 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) + if (unlikely(reacquire_kernel_lock(current) < 0)) { + prev = rq->curr; + switch_count = &prev->nivcsw; goto need_resched_nonpreemptible; + } preempt_enable_no_resched(); if (need_resched()) -- cgit v1.2.1 From fe432200abb0d64f409895168d9ad8fbb9d8e6c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2010 09:08:26 +0100 Subject: perf: Fix perf_event_do_pending() fallback callsite Paul questioned the context in which we should call perf_event_do_pending(). After looking at that I found that it should be called from IRQ context these days, however the fallback call-site is placed in softirq context. Amend this by placing the callback in the IRQ timer path.
Reported-by: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: <1263374859.4244.192.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/timer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 15533b792397..c61a7949387f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1198,6 +1198,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_check_callbacks(cpu, user_tick); printk_tick(); + perf_event_do_pending(); scheduler_tick(); run_posix_cpu_timers(p); } @@ -1209,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_event_do_pending(); - hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) -- cgit v1.2.1 From 22e190851f8709c48baf00ed9ce6144cdc54d025 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2010 09:12:32 +0100 Subject: perf: Honour event state for aux stream data Anton reported that perf record kept receiving events even after calling ioctl(PERF_EVENT_IOC_DISABLE). It turns out that FORK,COMM and MMAP events didn't respect the disabled state and kept flowing in. 
Reported-by: Anton Blanchard Signed-off-by: Peter Zijlstra Tested-by: Anton Blanchard LKML-Reference: <1263459187.4244.265.camel@laptop> CC: stable@kernel.org Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 603c0d8b5df1..d27746bd3a06 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3268,6 +3268,9 @@ static void perf_event_task_output(struct perf_event *event, static int perf_event_task_match(struct perf_event *event) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; @@ -3377,6 +3380,9 @@ static void perf_event_comm_output(struct perf_event *event, static int perf_event_comm_match(struct perf_event *event) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; @@ -3494,6 +3500,9 @@ static void perf_event_mmap_output(struct perf_event *event, static int perf_event_mmap_match(struct perf_event *event, struct perf_mmap_event *mmap_event) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; -- cgit v1.2.1 From fabf318e5e4bda0aca2b0d617b191884fda62703 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Jan 2010 21:04:57 +0100 Subject: sched: Fix fork vs hotplug vs cpuset namespaces There are a number of issues: 1) TASK_WAKING vs cgroup_clone (cpusets) copy_process(): sched_fork() child->state = TASK_WAKING; /* waiting for wake_up_new_task() */ if (current->nsproxy != p->nsproxy) ns_cgroup_clone() cgroup_clone() mutex_lock(inode->i_mutex) mutex_lock(cgroup_mutex) cgroup_attach_task() ss->can_attach() ss->attach() [ -> cpuset_attach() ] cpuset_attach_task() set_cpus_allowed_ptr(); while (child->state == TASK_WAKING) cpu_relax(); will deadlock the system. 
2) cgroup_clone (cpusets) vs copy_process So even if the above would work we still have: copy_process(): if (current->nsproxy != p->nsproxy) ns_cgroup_clone() cgroup_clone() mutex_lock(inode->i_mutex) mutex_lock(cgroup_mutex) cgroup_attach_task() ss->can_attach() ss->attach() [ -> cpuset_attach() ] cpuset_attach_task() set_cpus_allowed_ptr(); ... p->cpus_allowed = current->cpus_allowed over-writing the modified cpus_allowed. 3) fork() vs hotplug if we unplug the child's cpu after the sanity check when the child gets attached to the task_list but before wake_up_new_task() shit will meet with fan. Solve all these issues by moving fork cpu selection into wake_up_new_task(). Reported-by: Serge E. Hallyn Tested-by: Serge E. Hallyn Signed-off-by: Peter Zijlstra LKML-Reference: <1264106190.4283.1314.camel@laptop> Signed-off-by: Thomas Gleixner --- kernel/fork.c | 15 --------------- kernel/sched.c | 39 +++++++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5b2959b3ffc2..f88bd984df35 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1241,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); - /* - * The task hasn't been attached yet, so its cpus_allowed mask will - * not be changed, nor will its assigned CPU. - * - * The cpus_allowed mask of the parent may have changed after it was - * copied first time - so re-copy it here, then check the child's CPU - * to ensure it is on a valid CPU (and if not, just force it back to - * parent's CPU). This avoids alot of nasty races. 
- */ - p->cpus_allowed = current->cpus_allowed; - p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; - if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || - !cpu_online(task_cpu(p)))) - set_task_cpu(p, smp_processor_id()); - /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; diff --git a/kernel/sched.c b/kernel/sched.c index 4508fe7048be..3a8fb30a91b1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2320,14 +2320,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * Called from: + * Gets called from 3 sites (exec, fork, wakeup), since it is called without + * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done + * by: * - * - fork, @p is stable because it isn't on the tasklist yet - * - * - exec, @p is unstable, retry loop - * - * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so - * we should be good. + * exec: is unstable, retry loop + * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING */ static inline int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) @@ -2620,9 +2618,6 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); -#ifdef CONFIG_SMP - cpu = select_task_rq(p, SD_BALANCE_FORK, 0); -#endif set_task_cpu(p, cpu); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -2652,6 +2647,21 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; + int cpu = get_cpu(); + +#ifdef CONFIG_SMP + /* + * Fork balancing, do it here and not earlier because: + * - cpus_allowed can change in the fork path + * - any previously selected cpu might disappear through hotplug + * + * We still have TASK_WAKING but PF_STARTING is gone now, meaning + * ->cpus_allowed is stable, we have preemption disabled, meaning + * cpu_online_mask is stable. 
+ */ + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); + set_task_cpu(p, cpu); +#endif rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_WAKING); @@ -2665,6 +2675,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); + put_cpu(); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -7139,14 +7150,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) * the ->cpus_allowed mask from under waking tasks, which would be * possible when we change rq->lock in ttwu(), so synchronize against * TASK_WAKING to avoid that. + * + * Make an exception for freshly cloned tasks, since cpuset namespaces + * might move the task about, we have to validate the target in + * wake_up_new_task() anyway since the cpu might have gone away. */ again: - while (p->state == TASK_WAKING) + while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) cpu_relax(); rq = task_rq_lock(p, &flags); - if (p->state == TASK_WAKING) { + if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { task_rq_unlock(rq, &flags); goto again; } -- cgit v1.2.1 From 74bf4076f2ed79b5510440b72a561823a8852ec0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 25 Jan 2010 15:11:53 -0500 Subject: tracing: Prevent kernel oops with corrupted buffer If the contents of the ftrace ring buffer get corrupted and the trace file is read, it could create a kernel oops (usually just killing the user task thread). This is caused by the checking of the pid in the buffer. If the pid is negative, it still references the cmdline cache array, which could point to an invalid address. The simple fix is to test for negative PIDs.
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0df1b0f2cb9e..eac6875cb990 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -951,6 +951,11 @@ void trace_find_cmdline(int pid, char comm[]) return; } + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, ""); + return; + } + if (pid > PID_MAX_DEFAULT) { strcpy(comm, "<...>"); return; -- cgit v1.2.1 From 7b7422a566aa0dc1e582ce263d4c7ff4a772700a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 Jan 2010 12:51:10 +0100 Subject: clocksource: Prevent potential kgdb dead lock commit 0f8e8ef7 (clocksource: Simplify clocksource watchdog resume logic) introduced a potential kgdb dead lock. When the kernel is stopped by kgdb inside code which holds watchdog_lock then kgdb dead locks in clocksource_resume_watchdog(). clocksource_resume_watchdog() is called from kgdb via clocksource_touch_watchdog() to avoid that the clock source watchdog marks TSC unstable after the kernel has been stopped. Solve this by replacing spin_lock with a spin_trylock and just return in case the lock is held. Not resetting the watchdog might result in TSC becoming marked unstable, but that's an acceptable penalty for using kgdb. The timekeeping is anyway easily screwed up by kgdb when the system uses either jiffies or a clock source which wraps in short intervals (e.g. pm_timer wraps about every 4.6s), so we really do not have to worry about that occasional TSC marked unstable side effect. The second caller of clocksource_resume_watchdog() is clocksource_resume(). The trylock is safe here as well because the system is UP at this point, interrupts are disabled and nothing else can hold watchdog_lock().
Reported-by: Jason Wessel LKML-Reference: <1264480000-6997-4-git-send-email-jason.wessel@windriver.com> Cc: kgdb-bugreport@lists.sourceforge.net Cc: Martin Schwidefsky Cc: John Stultz Cc: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e85c23404d34..13700833c181 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void) { unsigned long flags; - spin_lock_irqsave(&watchdog_lock, flags); + /* + * We use trylock here to avoid a potential dead lock when + * kgdb calls this code after the kernel has been stopped with + * watchdog_lock held. When watchdog_lock is held we just + * return and accept, that the watchdog might trigger and mark + * the monitored clock source (usually TSC) unstable. + * + * This does not affect the other caller clocksource_resume() + * because at this point the kernel is UP, interrupts are + * disabled and nothing can hold watchdog_lock. + */ + if (!spin_trylock_irqsave(&watchdog_lock, flags)) + return; clocksource_reset_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } @@ -458,8 +470,8 @@ void clocksource_resume(void) * clocksource_touch_watchdog - Update watchdog * * Update the watchdog after exception contexts such as kgdb so as not - * to incorrectly trip the watchdog. - * + * to incorrectly trip the watchdog. This might fail when the kernel + * was stopped in code which holds watchdog_lock. */ void clocksource_touch_watchdog(void) { -- cgit v1.2.1 From 492a74f4210e15f4701422e2e1c4cd3c1e45ddae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 25 Jan 2010 15:17:47 -0500 Subject: ring-buffer: Check if ring buffer iterator has stale data Usually reads of the ring buffer are performed by a single task. There are two types of reads from the ring buffer.
One is a consuming read which will consume the entry that was read and the next read will be the entry that follows. The other is an iterator that will let the user read the contents of the ring buffer without modifying it. When an iterator is allocated, writes to the ring buffer are disabled to protect the iterator. The problem exists when consuming reads happen while an iterator is allocated. Specifically, the kind of read that swaps out an entire page (used by splice) and replaces it with a new read. If the iterator is on the page that is swapped out, then the next read may read from this swapped out page and return garbage. This patch adds a check when reading the iterator to make sure that the iterator contents are still valid. If a consuming read has taken place, the iterator is reset. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index edefe3b2801b..503b630e0bda 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -464,6 +464,8 @@ struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; struct buffer_page *head_page; + struct buffer_page *cache_reader_page; + unsigned long cache_read; u64 read_stamp; }; @@ -2716,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; + iter->cache_reader_page = cpu_buffer->reader_page; + iter->cache_read = cpu_buffer->read; } /** @@ -3066,6 +3070,15 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; + /* + * Check if someone performed a consuming read to + * the buffer. A consuming read invalidates the iterator + * and we need to reset the iterator in this case. 
+ */ + if (unlikely(iter->cache_read != cpu_buffer->read || + iter->cache_reader_page != cpu_buffer->reader_page)) + rb_iter_reset(iter); + again: /* * We repeat when a timestamp is encountered. -- cgit v1.2.1 From 3c05d7482777f15e71bb4cb1ba78dee2800dfec6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 26 Jan 2010 16:14:08 -0500 Subject: ring-buffer: Check for end of page in iterator If the iterator comes to an empty page for some reason, or if the page is emptied by a consuming read, the iterator code currently does not check if the iterator is past the contents, and may return a false entry. This patch adds a check to the ring buffer iterator to test if the current page has been completely read and sets the iterator to the next page if necessary. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 503b630e0bda..8c1b2d290718 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3064,9 +3064,6 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; - if (ring_buffer_iter_empty(iter)) - return NULL; - cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; @@ -3080,6 +3077,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) rb_iter_reset(iter); again: + if (ring_buffer_iter_empty(iter)) + return NULL; + /* * We repeat when a timestamp is encountered.
* We can get multiple timestamps by nested interrupts or also @@ -3094,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) if (rb_per_cpu_empty(cpu_buffer)) return NULL; + if (iter->head >= local_read(&iter->head_page->page->commit)) { + rb_inc_iter(iter); + goto again; + } + event = rb_iter_head_event(iter); switch (event->type_len) { -- cgit v1.2.1 From 03688970347bfea32823953a7ce5886d1713205f Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 22 Jan 2010 08:12:47 -0500 Subject: tracing/documentation: Cover new frame pointer semantics Update the graph tracer examples to cover the new frame pointer semantics (in terms of passing it along). Move the HAVE_FUNCTION_GRAPH_FP_TEST docs out of the Kconfig, into the right place, and expand on the details. Signed-off-by: Mike Frysinger LKML-Reference: <1264165967-18938-1-git-send-email-vapier@gentoo.org> Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6c22d8a2f289..60e2ce0181ee 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER config HAVE_FUNCTION_GRAPH_FP_TEST bool help - An arch may pass in a unique value (frame pointer) to both the - entering and exiting of a function. On exit, the value is compared - and if it does not match, then it will panic the kernel. 
+ See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool -- cgit v1.2.1 From 48d50674179981e41f432167b2441cec782d5484 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 26 Jan 2010 19:16:41 +0100 Subject: lockdep: Fix check_usage_backwards() error message Lockdep has found the real bug, but the output doesn't look right to me: > ========================================================= > [ INFO: possible irq lock inversion dependency detected ] > 2.6.33-rc5 #77 > --------------------------------------------------------- > emacs/1609 just changed the state of lock: > (&(&tty->ctrl_lock)->rlock){+.....}, at: [] tty_fasync+0xe8/0x190 > but this lock took another, HARDIRQ-unsafe lock in the past: > (&(&sighand->siglock)->rlock){-.....} "HARDIRQ-unsafe" and "this lock took another" looks wrong, afaics. > ... key at: [] __key.46539+0x0/0x8 > ... acquired at: > [] __lock_acquire+0x1056/0x15a0 > [] lock_acquire+0x9f/0x120 > [] _raw_spin_lock_irqsave+0x52/0x90 > [] __proc_set_tty+0x3e/0x150 > [] tty_open+0x51d/0x5e0 The stack-trace shows that this lock (ctrl_lock) was taken under ->siglock (which is hopefully irq-safe). This is a clear typo in check_usage_backwards() where we tell the print a fancy routine we're forwards. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <20100126181641.GA10460@redhat.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5feaddcdbe49..c62ec14609b9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2147,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, return ret; return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + this, 0, irqclass); } void print_irqtrace_events(struct task_struct *curr) -- cgit v1.2.1 From 11854247e2c851e7ff9ce138e501c6cffc5a4217 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Jan 2010 16:34:27 +0100 Subject: sched: Fix incorrect sanity check We moved to migrate on wakeup, which means that sleeping tasks could still be present on offline cpus. Amend the check to only test running tasks. Reported-by: Heiko Carstens Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 1c8ddd6ee940..08e54e7beaed 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -151,7 +151,7 @@ static inline void check_for_tasks(int cpu) write_lock_irq(&tasklist_lock); for_each_process(p) { - if (task_cpu(p) == cpu && + if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ -- cgit v1.2.1 From 9d3cfc4c1d17c6d3bc1373e3b954c56b92607755 Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Mon, 25 Jan 2010 14:56:34 +0100 Subject: sched: Correct printk whitespace in warning from cpu down task check Due to an incorrect line break the output currently contains tabs. Also remove trailing space. 
The actual output that logcheck sent me looked like this: Task events/1 (pid = 10) is on cpu 1^I^I^I^I(state = 1, flags = 84208040) After this patch it becomes: Task events/1 (pid = 10) is on cpu 1 (state = 1, flags = 84208040) Signed-off-by: Frans Pop Signed-off-by: Peter Zijlstra LKML-Reference: <201001251456.34996.elendil@planet.nl> Signed-off-by: Ingo Molnar --- kernel/cpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 08e54e7beaed..677f25376a38 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -154,10 +154,10 @@ static inline void check_for_tasks(int cpu) if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ - (state = %ld, flags = %x) \n", - p->comm, task_pid_nr(p), cpu, - p->state, p->flags); + printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " + "(state = %ld, flags = %x)\n", + p->comm, task_pid_nr(p), cpu, + p->state, p->flags); } write_unlock_irq(&tasklist_lock); } -- cgit v1.2.1 From b23ff0e9330e4b11e18af984d50573598e10e7f9 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Thu, 21 Jan 2010 18:25:16 +0530 Subject: hw_breakpoints: Release the bp slot if arch_validate_hwbkpt_settings() fails. On a given architecture, when hardware breakpoint registration fails due to un-supported access type (read/write/execute), we lose the bp slot since register_perf_hw_breakpoint() does not release the bp slot on failure. Hence, any subsequent hardware breakpoint registration starts failing with 'no space left on device' error. This patch introduces error handling in register_perf_hw_breakpoint() function and releases bp slot on error. Signed-off-by: Mahesh Salgaonkar Cc: Ananth N Mavinakayanahalli Cc: K. 
Prasad Cc: Maneesh Soni LKML-Reference: <20100121125516.GA32521@in.ibm.com> Signed-off-by: Frederic Weisbecker --- kernel/hw_breakpoint.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 50dbd5999588..c030ae657f20 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -296,6 +296,10 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (!bp->attr.disabled || !bp->overflow_handler) ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + /* if arch_validate_hwbkpt_settings() fails then release bp slot */ + if (ret) + release_bp_slot(bp); + return ret; } -- cgit v1.2.1 From cc0967490c1c3824bc5b75718b6ca8a51d9f2617 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jan 2010 17:04:42 -0600 Subject: x86, hw_breakpoints, kgdb: Fix kgdb to use hw_breakpoint API In the 2.6.33 kernel, the hw_breakpoint API is now used for the performance event counters. The hw_breakpoint_handler() now consumes the hw breakpoints that were previously set by kgdb arch specific code. In order for kgdb to work in conjunction with this core API change, kgdb must use some of the low level functions of the hw_breakpoint API to install, uninstall, and deal with hw breakpoint reservations. The kgdb core required a change to call kgdb_disable_hw_debug anytime a slave cpu enters kgdb_wait() in order to keep all the hw breakpoints in sync as well as to prevent hitting a hw breakpoint while kgdb is active. During the architecture specific initialization of kgdb, it will pre-allocate 4 disabled (struct perf event **) structures. Kgdb will use these to manage the capabilities for the 4 hw breakpoint registers, per cpu. Right now the hw_breakpoint API does not have a way to ask how many breakpoints are available, on each CPU so it is possible that the install of a breakpoint might fail when kgdb restores the system to the run state. 
The intent of this patch is to first get the basic functionality of hw breakpoints working and leave it to the person debugging the kernel to understand what hw breakpoints are in use and what restrictions have been imposed as a result. Breakpoint constraints will be dealt with in a future patch. While atomic, the x86 specific kgdb code will call arch_uninstall_hw_breakpoint() and arch_install_hw_breakpoint() to manage the cpu specific hw breakpoints. The net result of these changes allows kgdb to use the same pool of hw_breakpoints that are used by the perf event API, but neither knows about future reservations for the available hw breakpoint slots. Signed-off-by: Jason Wessel Acked-by: Frederic Weisbecker Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad Cc: Peter Zijlstra Cc: Alan Stern Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- kernel/kgdb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 2eb517e23514..c7ade62e4ef0 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -583,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs) smp_wmb(); atomic_set(&cpu_in_kgdb[cpu], 1); + /* Disable any cpu specific hw breakpoints */ + kgdb_disable_hw_debug(regs); + /* Wait till primary CPU is done with debugging */ while (atomic_read(&passive_cpu_wait[cpu])) cpu_relax(); -- cgit v1.2.1 From 5352ae638e2d7d5c9b2e4d528676bbf2af6fd6f3 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jan 2010 17:04:43 -0600 Subject: perf, hw_breakpoint, kgdb: Do not take mutex for kernel debugger This patch fixes the regression in functionality where the kernel debugger and the perf API do not nicely share hw breakpoint reservations. The kernel debugger cannot use any mutex_lock() calls because it can start the kernel running from an invalid context.
A mutex free version of the reservation API needed to get created for the kernel debugger to safely update hw breakpoint reservations. The possibility for a breakpoint reservation to be concurrently processed at the time that kgdb interrupts the system is improbable. Should this corner case occur the end user is warned, and the kernel debugger will prohibit updating the hardware breakpoint reservations. Any time the kernel debugger reserves a hardware breakpoint it will be a system wide reservation. Signed-off-by: Jason Wessel Acked-by: Frederic Weisbecker Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad Cc: Peter Zijlstra Cc: Alan Stern Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-3-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 52 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index c030ae657f20..8a5c7d55ac9f 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ -int reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; - int ret = 0; - - mutex_lock(&nr_bp_mutex); fetch_bp_busy_slots(&slots, bp); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) { - ret = -ENOSPC; - goto end; - } + if (slots.pinned + (!!slots.flexible) == HBP_NUM) + return -ENOSPC; toggle_bp_slot(bp, true); -end: + return 0; +} + +int reserve_bp_slot(struct perf_event *bp) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + + ret = __reserve_bp_slot(bp); + mutex_unlock(&nr_bp_mutex); return ret; } +static void __release_bp_slot(struct perf_event 
*bp) +{ + toggle_bp_slot(bp, false); +} + void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); - toggle_bp_slot(bp, false); + __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); } +/* + * Allow the kernel debugger to reserve breakpoint slots without + * taking a lock using the dbg_* variant of for the reserve and + * release breakpoint slots. + */ +int dbg_reserve_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + return __reserve_bp_slot(bp); +} + +int dbg_release_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + __release_bp_slot(bp); + + return 0; +} int register_perf_hw_breakpoint(struct perf_event *bp) { -- cgit v1.2.1