From 426d31071ac476ea62c62656b242930c17b58c00 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sat, 7 Aug 2010 12:30:03 +0200 Subject: fix printk typo 'faild' Signed-off-by: Paul Bolle Signed-off-by: Jiri Kosina --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f52b5f50299d..58716e73e2a2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -490,7 +490,7 @@ static int register_trace_probe(struct trace_probe *tp) } ret = register_probe_event(tp); if (ret) { - pr_warning("Faild to register probe event(%d)\n", ret); + pr_warning("Failed to register probe event(%d)\n", ret); goto end; } -- cgit v1.2.1 From 401a8d048eadfbe1b1c1bf53d3b614fcc894c61a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Sep 2010 10:36:00 +0200 Subject: workqueue: cleanup flush/cancel functions Make the following cleanup changes. * Relocate flush/cancel function prototypes and definitions. * Relocate wait_on_cpu_work() and wait_on_work() before try_to_grab_pending(). These will be used to implement flush_work_sync(). * Make all flush/cancel functions return bool instead of int. * Update wait_on_cpu_work() and wait_on_work() to return %true if they actually waited. * Add / update comments. This patch doesn't cause any functional changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 175 ++++++++++++++++++++++++++++------------------------- 1 file changed, 94 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f77afd939229..1240b9d94b03 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2327,16 +2327,24 @@ out_unlock: EXPORT_SYMBOL_GPL(flush_workqueue); /** - * flush_work - block until a work_struct's callback has terminated - * @work: the work which is to be flushed + * flush_work - wait for a work to finish executing the last queueing instance + * @work: the work to flush * - * Returns false if @work has already terminated. + * Wait until @work has finished execution. This function considers + * only the last queueing instance of @work. If @work has been + * enqueued across different CPUs on a non-reentrant workqueue or on + * multiple workqueues, @work might still be executing on return on + * some of the CPUs from earlier queueing. * - * It is expected that, prior to calling flush_work(), the caller has - * arranged for the work to not be requeued, otherwise it doesn't make - * sense to use this function. + * If @work was queued only on a non-reentrant, ordered or unbound + * workqueue, @work is guaranteed to be idle on return if it hasn't + * been requeued since flush started. + * + * RETURNS: + * %true if flush_work() waited for the work to finish execution, + * %false if it was already idle. 
*/ -int flush_work(struct work_struct *work) +bool flush_work(struct work_struct *work) { struct worker *worker = NULL; struct global_cwq *gcwq; @@ -2374,13 +2382,49 @@ int flush_work(struct work_struct *work) wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); - return 1; + return true; already_gone: spin_unlock_irq(&gcwq->lock); - return 0; + return false; } EXPORT_SYMBOL_GPL(flush_work); +static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) +{ + struct wq_barrier barr; + struct worker *worker; + + spin_lock_irq(&gcwq->lock); + + worker = find_worker_executing_work(gcwq, work); + if (unlikely(worker)) + insert_wq_barrier(worker->current_cwq, &barr, work, worker); + + spin_unlock_irq(&gcwq->lock); + + if (unlikely(worker)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else + return false; +} + +static bool wait_on_work(struct work_struct *work) +{ + bool ret = false; + int cpu; + + might_sleep(); + + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); + + for_each_gcwq_cpu(cpu) + ret |= wait_on_cpu_work(get_gcwq(cpu), work); + return ret; +} + /* * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, * so this work can't be re-armed in any way. @@ -2423,39 +2467,7 @@ static int try_to_grab_pending(struct work_struct *work) return ret; } -static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) -{ - struct wq_barrier barr; - struct worker *worker; - - spin_lock_irq(&gcwq->lock); - - worker = find_worker_executing_work(gcwq, work); - if (unlikely(worker)) - insert_wq_barrier(worker->current_cwq, &barr, work, worker); - - spin_unlock_irq(&gcwq->lock); - - if (unlikely(worker)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - } -} - -static void wait_on_work(struct work_struct *work) -{ - int cpu; - - might_sleep(); - - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - - for_each_gcwq_cpu(cpu) - wait_on_cpu_work(get_gcwq(cpu), work); -} - -static int __cancel_work_timer(struct work_struct *work, +static bool __cancel_work_timer(struct work_struct *work, struct timer_list* timer) { int ret; @@ -2472,42 +2484,60 @@ static int __cancel_work_timer(struct work_struct *work, } /** - * cancel_work_sync - block until a work_struct's callback has terminated - * @work: the work which is to be flushed - * - * Returns true if @work was pending. + * cancel_work_sync - cancel a work and wait for it to finish + * @work: the work to cancel * - * cancel_work_sync() will cancel the work if it is queued. If the work's - * callback appears to be running, cancel_work_sync() will block until it - * has completed. + * Cancel @work and wait for its execution to finish. This function + * can be used even if the work re-queues itself or migrates to + * another workqueue. On return from this function, @work is + * guaranteed to be not pending or executing on any CPU. * - * It is possible to use this function if the work re-queues itself. It can - * cancel the work even if it migrates to another workqueue, however in that - * case it only guarantees that work->func() has completed on the last queued - * workqueue. - * - * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not - * pending, otherwise it goes into a busy-wait loop until the timer expires. + * cancel_work_sync(&delayed_work->work) must not be used for + * delayed_work's. Use cancel_delayed_work_sync() instead. 
* - * The caller must ensure that workqueue_struct on which this work was last + * The caller must ensure that the workqueue on which @work was last * queued can't be destroyed before this function returns. + * + * RETURNS: + * %true if @work was pending, %false otherwise. */ -int cancel_work_sync(struct work_struct *work) +bool cancel_work_sync(struct work_struct *work) { return __cancel_work_timer(work, NULL); } EXPORT_SYMBOL_GPL(cancel_work_sync); /** - * cancel_delayed_work_sync - reliably kill off a delayed work. - * @dwork: the delayed work struct + * flush_delayed_work - wait for a dwork to finish executing the last queueing + * @dwork: the delayed work to flush + * + * Delayed timer is cancelled and the pending work is queued for + * immediate execution. Like flush_work(), this function only + * considers the last queueing instance of @dwork. + * + * RETURNS: + * %true if flush_work() waited for the work to finish execution, + * %false if it was already idle. + */ +bool flush_delayed_work(struct delayed_work *dwork) +{ + if (del_timer_sync(&dwork->timer)) + __queue_work(raw_smp_processor_id(), + get_work_cwq(&dwork->work)->wq, &dwork->work); + return flush_work(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work); + +/** + * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish + * @dwork: the delayed work cancel * - * Returns true if @dwork was pending. + * This is cancel_work_sync() for delayed works. * - * It is possible to use this function if @dwork rearms itself via queue_work() - * or queue_delayed_work(). See also the comment for cancel_work_sync(). + * RETURNS: + * %true if @dwork was pending, %false otherwise. */ -int cancel_delayed_work_sync(struct delayed_work *dwork) +bool cancel_delayed_work_sync(struct delayed_work *dwork) { return __cancel_work_timer(&dwork->work, &dwork->timer); } @@ -2558,23 +2588,6 @@ int schedule_delayed_work(struct delayed_work *dwork, } EXPORT_SYMBOL(schedule_delayed_work); -/** - * flush_delayed_work - block until a dwork_struct's callback has terminated - * @dwork: the delayed work which is to be flushed - * - * Any timeout is cancelled, and any pending work is run immediately. - */ -void flush_delayed_work(struct delayed_work *dwork) -{ - if (del_timer_sync(&dwork->timer)) { - __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, - &dwork->work); - put_cpu(); - } - flush_work(&dwork->work); -} -EXPORT_SYMBOL(flush_delayed_work); - /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use -- cgit v1.2.1 From baf59022c37d43f202e62d5130e4bac5e825b426 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Sep 2010 10:42:16 +0200 Subject: workqueue: factor out start_flush_work() Factor out start_flush_work() from flush_work(). start_flush_work() has @wait_executing argument which controls whether the barrier is queued only if the work is pending or also if executing. As flush_work() needs to wait for execution too, it uses %true. This commit doesn't cause any behavior difference. start_flush_work() will be used to implement flush_work_sync(). 
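As a usage illustration of the flush/cancel API documented above, here is a minimal, hypothetical sketch (not part of these patches; my_dev, my_work_fn and my_dev_teardown are invented names) of how a caller might use the new bool return values:

#include <linux/workqueue.h>

struct my_dev {
	struct work_struct work;
};

static void my_work_fn(struct work_struct *work)
{
	/* process one unit of deferred work */
}

static void my_dev_teardown(struct my_dev *dev)
{
	/*
	 * flush_work() waits only for the last queueing instance of
	 * dev->work; it returns true if it actually had to wait and
	 * false if the work was already idle.
	 */
	if (flush_work(&dev->work))
		pr_debug("my_dev: waited for in-flight work\n");

	/*
	 * If the work function may requeue itself, cancel instead:
	 * on return the work is neither pending nor executing on
	 * any CPU.
	 */
	cancel_work_sync(&dev->work);
}

The same pattern applies to delayed works via flush_delayed_work() and cancel_delayed_work_sync().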
Signed-off-by: Tejun Heo --- kernel/workqueue.c | 64 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1240b9d94b03..33d31d768706 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2326,35 +2326,17 @@ out_unlock: } EXPORT_SYMBOL_GPL(flush_workqueue); -/** - * flush_work - wait for a work to finish executing the last queueing instance - * @work: the work to flush - * - * Wait until @work has finished execution. This function considers - * only the last queueing instance of @work. If @work has been - * enqueued across different CPUs on a non-reentrant workqueue or on - * multiple workqueues, @work might still be executing on return on - * some of the CPUs from earlier queueing. - * - * If @work was queued only on a non-reentrant, ordered or unbound - * workqueue, @work is guaranteed to be idle on return if it hasn't - * been requeued since flush started. - * - * RETURNS: - * %true if flush_work() waited for the work to finish execution, - * %false if it was already idle. - */ -bool flush_work(struct work_struct *work) +static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, + bool wait_executing) { struct worker *worker = NULL; struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; - struct wq_barrier barr; might_sleep(); gcwq = get_work_gcwq(work); if (!gcwq) - return 0; + return false; spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { @@ -2367,26 +2349,54 @@ bool flush_work(struct work_struct *work) cwq = get_work_cwq(work); if (unlikely(!cwq || gcwq != cwq->gcwq)) goto already_gone; - } else { + } else if (wait_executing) { worker = find_worker_executing_work(gcwq, work); if (!worker) goto already_gone; cwq = worker->current_cwq; - } + } else + goto already_gone; - insert_wq_barrier(cwq, &barr, work, worker); + insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); lock_map_acquire(&cwq->wq->lockdep_map); lock_map_release(&cwq->wq->lockdep_map); - - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); return true; already_gone: spin_unlock_irq(&gcwq->lock); return false; } + +/** + * flush_work - wait for a work to finish executing the last queueing instance + * @work: the work to flush + * + * Wait until @work has finished execution. This function considers + * only the last queueing instance of @work. If @work has been + * enqueued across different CPUs on a non-reentrant workqueue or on + * multiple workqueues, @work might still be executing on return on + * some of the CPUs from earlier queueing. + * + * If @work was queued only on a non-reentrant, ordered or unbound + * workqueue, @work is guaranteed to be idle on return if it hasn't + * been requeued since flush started. + * + * RETURNS: + * %true if flush_work() waited for the work to finish execution, + * %false if it was already idle. + */ +bool flush_work(struct work_struct *work) +{ + struct wq_barrier barr; + + if (start_flush_work(work, &barr, true)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else + return false; +} EXPORT_SYMBOL_GPL(flush_work); static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) -- cgit v1.2.1 From 09383498c5d35262e643bfdbae84826177a3c624 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Sep 2010 10:48:29 +0200 Subject: workqueue: implement flush[_delayed]_work_sync() Implement flush[_delayed]_work_sync(). 
These are flush functions which also make sure no CPU is still executing the target work from earlier queueing instances. These are similar to cancel[_delayed]_work_sync() except that the target work item is flushed instead of cancelled. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33d31d768706..19e4bc15ee99 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2435,6 +2435,41 @@ static bool wait_on_work(struct work_struct *work) return ret; } +/** + * flush_work_sync - wait until a work has finished execution + * @work: the work to flush + * + * Wait until @work has finished execution. On return, it's + * guaranteed that all queueing instances of @work which happened + * before this function is called are finished. In other words, if + * @work hasn't been requeued since this function was called, @work is + * guaranteed to be idle on return. + * + * RETURNS: + * %true if flush_work_sync() waited for the work to finish execution, + * %false if it was already idle. + */ +bool flush_work_sync(struct work_struct *work) +{ + struct wq_barrier barr; + bool pending, waited; + + /* we'll wait for executions separately, queue barr only if pending */ + pending = start_flush_work(work, &barr, false); + + /* wait for executions to finish */ + waited = wait_on_work(work); + + /* wait for the pending one */ + if (pending) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } + + return pending || waited; +} +EXPORT_SYMBOL_GPL(flush_work_sync); + /* * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, * so this work can't be re-armed in any way. @@ -2538,6 +2573,27 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +/** + * flush_delayed_work_sync - wait for a dwork to finish + * @dwork: the delayed work to flush + * + * Delayed timer is cancelled and the pending work is queued for + * execution immediately. Other than timer handling, its behavior + * is identical to flush_work_sync(). + * + * RETURNS: + * %true if flush_work_sync() waited for the work to finish execution, + * %false if it was already idle. + */ +bool flush_delayed_work_sync(struct delayed_work *dwork) +{ + if (del_timer_sync(&dwork->timer)) + __queue_work(raw_smp_processor_id(), + get_work_cwq(&dwork->work)->wq, &dwork->work); + return flush_work_sync(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work_sync); + /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish * @dwork: the delayed work cancel -- cgit v1.2.1 From 676cb02dc32adef13d9efb5ea52079e4ede1e3ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Jul 2009 23:33:49 +0200 Subject: softirqs: Make wakeup_softirqd static No users outside of kernel/softirq.c Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73a..80f6e3bd1d2a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -67,7 +67,7 @@ char *softirq_to_name[NR_SOFTIRQS] = { * to the pending events, so lets the scheduler to balance * the softirq load for us. 
*/ -void wakeup_softirqd(void) +static void wakeup_softirqd(void) { /* Interrupts are disabled: no need to stop preemption */ struct task_struct *tsk = __get_cpu_var(ksoftirqd); -- cgit v1.2.1 From 99a51792dbb7e6e39b2a4ebcfe202f1dcc7354c4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 4 Sep 2010 18:52:50 -0700 Subject: kernel/pm_qos_params.c: Remove unnecessary casts of private_data Signed-off-by: Joe Perches Signed-off-by: Jiri Kosina --- kernel/pm_qos_params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 996a4dec5f96..49829c5b4b47 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -394,7 +394,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, } else return -EINVAL; - pm_qos_req = (struct pm_qos_request_list *)filp->private_data; + pm_qos_req = filp->private_data; pm_qos_update_request(pm_qos_req, value); return count; -- cgit v1.2.1 From db71922217a214e5c9268448e537b54fc1f301ea Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Sun, 15 Aug 2010 22:51:10 +0200 Subject: BKL: Explicitly add BKL around get_sb/fill_super This patch is a preparation necessary to remove the BKL from do_new_mount(). It explicitly adds calls to lock_kernel()/unlock_kernel() around get_sb/fill_super operations for filesystems that still uses the BKL. I've read through all the code formerly covered by the BKL inside do_kern_mount() and have satisfied myself that it doesn't need the BKL any more. do_kern_mount() is already called without the BKL when mounting the rootfs and in nfsctl. do_kern_mount() calls vfs_kern_mount(), which is called from various places without BKL: simple_pin_fs(), nfs_do_clone_mount() through nfs_follow_mountpoint(), afs_mntpt_do_automount() through afs_mntpt_follow_link(). Both later functions are actually the filesystems follow_link inode operation. vfs_kern_mount() is calling the specified get_sb function and lets the filesystem do its job by calling the given fill_super function. Therefore I think it is safe to push down the BKL from the VFS to the low-level filesystems get_sb/fill_super operation. [arnd: do not add the BKL to those file systems that already don't use it elsewhere] Signed-off-by: Jan Blunck Signed-off-by: Arnd Bergmann Cc: Matthew Wilcox Cc: Christoph Hellwig --- kernel/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c9483d8f6140..a7ba3bccadc5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1430,6 +1430,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *new_root; + lock_kernel(); + /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); ret = parse_cgroupfs_options(data, &opts); @@ -1559,6 +1561,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, simple_set_mnt(mnt, sb); kfree(opts.release_agent); kfree(opts.name); + unlock_kernel(); return 0; drop_new_super: @@ -1568,6 +1571,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, out_err: kfree(opts.release_agent); kfree(opts.name); + unlock_kernel(); return ret; } -- cgit v1.2.1 From 38d018dba3f725b969f196550d92a6ec1c092428 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Wed, 24 Feb 2010 13:25:34 +0100 Subject: BKL: Remove BKL from cgroup The BKL is only used in remount_fs and get_sb that are both protected by the superblocks s_umount rw_semaphore. 
Therefore it is safe to remove the BKL entirely. Signed-off-by: Jan Blunck Signed-off-by: Arnd Bergmann --- kernel/cgroup.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7ba3bccadc5..304d27759949 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include /* TODO: replace with more sophisticated array */ @@ -1222,7 +1221,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; - lock_kernel(); mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); @@ -1255,7 +1253,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - unlock_kernel(); return ret; } @@ -1430,8 +1427,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *new_root; - lock_kernel(); - /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); ret = parse_cgroupfs_options(data, &opts); @@ -1561,7 +1556,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, simple_set_mnt(mnt, sb); kfree(opts.release_agent); kfree(opts.name); - unlock_kernel(); return 0; drop_new_super: @@ -1571,8 +1565,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, out_err: kfree(opts.release_agent); kfree(opts.name); - unlock_kernel(); - return ret; } -- cgit v1.2.1 From 97bd234701b2b39a0e749c1fe0e44f1d14c94292 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Oct 2010 10:41:14 +0200 Subject: workqueue: prepare for more tracepoints Define workqueue_work event class and use it for workqueue_execute_end trace point. Also, move trace/events/workqueue.h include downwards such that all struct definitions are visible to it. This is to prepare for more tracepoints and doesn't cause any functional change. Signed-off-by: Tejun Heo Cc: Frederic Weisbecker --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 19e4bc15ee99..026f778e879b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -42,9 +42,6 @@ #include #include -#define CREATE_TRACE_POINTS -#include - #include "workqueue_sched.h" enum { @@ -257,6 +254,9 @@ EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); +#define CREATE_TRACE_POINTS +#include + #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) -- cgit v1.2.1 From cdadf0097cdca06c497ffaeb5982e028c6e4ed38 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Oct 2010 10:49:55 +0200 Subject: workqueue: add queue_work and activate_work trace points These two tracepoints allow tracking when and how a work is queued and activated. This patch is based on Frederic's patch to add queue_work trace point. 
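For reference, trace points like these are normally built on a shared event class so that several of them reuse one record format. A minimal sketch in that style (hedged: the real definitions live in include/trace/events/workqueue.h and may carry different fields):

/* Inside a trace event header, with the usual TRACE_SYSTEM setup. */
#include <linux/tracepoint.h>

/* One class describes the record layout once... */
DECLARE_EVENT_CLASS(workqueue_work,

	TP_PROTO(struct work_struct *work),

	TP_ARGS(work),

	TP_STRUCT__entry(
		__field(void *, work)
	),

	TP_fast_assign(
		__entry->work = work;
	),

	TP_printk("work struct %p", __entry->work)
);

/* ...and each concrete trace point reuses it. */
DEFINE_EVENT(workqueue_work, workqueue_activate_work,

	TP_PROTO(struct work_struct *work),

	TP_ARGS(work)
);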
Signed-off-by: Tejun Heo Cc: Frederic Weisbecker --- kernel/workqueue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 026f778e879b..cb2ccfbed0c6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -997,6 +997,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, /* gcwq determined, get cwq and queue */ cwq = get_cwq(gcwq->cpu, wq); + trace_workqueue_queue_work(cpu, cwq, work); BUG_ON(!list_empty(&work->entry)); @@ -1004,6 +1005,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, work_flags = work_color_to_flags(cwq->work_color); if (likely(cwq->nr_active < cwq->max_active)) { + trace_workqueue_activate_work(work); cwq->nr_active++; worklist = gcwq_determine_ins_pos(gcwq, cwq); } else { @@ -1679,6 +1681,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) struct work_struct, entry); struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); + trace_workqueue_activate_work(work); move_linked_works(work, pos, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); cwq->nr_active++; -- cgit v1.2.1 From 30310045dd20a286cf3800f063f79b468e132fb1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 Oct 2010 11:51:57 +0200 Subject: workqueue: fix HIGHPRI handling in keep_working() The policy function keep_working() didn't check GCWQ_HIGHPRI_PENDING and could return %false with highpri work pending. This could lead to late execution of a highpri work which was delayed due to @max_active throttling if other works are actively consuming CPU cycles. For example, the following could happen. 1. Work W0 which burns CPU cycles. 2. Two works W1 and W2 are queued to a highpri wq w/ @max_active of 1. 3. W1 starts executing and W2 is put to delayed queue. W0 and W1 are both runnable. 4. W1 finishes which puts W2 to pending queue but keep_working() incorrectly returns %false and the worker goes to sleep. 5. W0 finishes and W2 starts execution. With this patch applied, W2 starts execution as soon as W1 finishes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cb2ccfbed0c6..b57a8babdec3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -604,7 +604,9 @@ static bool keep_working(struct global_cwq *gcwq) { atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); - return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; + return !list_empty(&gcwq->worklist) && + (atomic_read(nr_running) <= 1 || + gcwq->flags & GCWQ_HIGHPRI_PENDING); } /* Do we need a new worker? Called from manager. */ -- cgit v1.2.1 From 6370a6ad3b53df90b4700977f7718118a2cd524a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 Oct 2010 15:12:27 +0200 Subject: workqueue: add and use WQ_MEM_RECLAIM flag Add WQ_MEM_RECLAIM flag which currently maps to WQ_RESCUER, mark WQ_RESCUER as internal and replace all external WQ_RESCUER usages to WQ_MEM_RECLAIM. This makes the API users express the intent of the workqueue instead of indicating the internal mechanism used to guarantee forward progress. This is also to make it cleaner to add more semantics to WQ_MEM_RECLAIM. For example, if deemed necessary, memory reclaim workqueues can be made highpri. This patch doesn't introduce any functional change. 
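As a usage illustration, a subsystem whose work items must be able to run under memory pressure would now state that intent directly when creating its workqueue. A hypothetical sketch (the "myfs" names are invented; alloc_workqueue() takes name, flags and max_active in this era):

#include <linux/workqueue.h>

static struct workqueue_struct *myfs_wq;

static int myfs_create_wq(void)
{
	/*
	 * WQ_MEM_RECLAIM expresses the intent (usable during memory
	 * reclaim); per the patch above it currently maps to the
	 * internal WQ_RESCUER flag, i.e. a rescuer thread guarantees
	 * forward progress.
	 */
	myfs_wq = alloc_workqueue("myfs_reclaim", WQ_MEM_RECLAIM, 1);
	return myfs_wq ? 0 : -ENOMEM;
}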
Signed-off-by: Tejun Heo Cc: Jeff Garzik Cc: Dave Chinner Cc: Steven Whitehouse --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b57a8babdec3..2c6871cbcbee 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2847,6 +2847,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, struct workqueue_struct *wq; unsigned int cpu; + /* + * Workqueues which may be used during memory reclaim should + * have a rescuer to guarantee forward progress. + */ + if (flags & WQ_MEM_RECLAIM) + flags |= WQ_RESCUER; + /* * Unbound workqueues aren't concurrency managed and should be * dispatched to workers immediately. -- cgit v1.2.1 From 6038f373a3dc1f1c26496e60b6c40b164716f07e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 15 Aug 2010 18:52:59 +0200 Subject: llseek: automatically add .llseek fop All file_operations should get a .llseek operation so we can make nonseekable_open the default for future file operations without a .llseek pointer. The three cases that we can automatically detect are no_llseek, seq_lseek and default_llseek. For cases where we can we can automatically prove that the file offset is always ignored, we use noop_llseek, which maintains the current behavior of not returning an error from a seek. New drivers should normally not use noop_llseek but instead use no_llseek and call nonseekable_open at open time. Existing drivers can be converted to do the same when the maintainer knows for certain that no user code relies on calling seek on the device file. The generated code is often incorrectly indented and right now contains comments that clarify for each added line why a specific variant was chosen. In the version that gets submitted upstream, the comments will be gone and I will manually fix the indentation, because there does not seem to be a way to do that using coccinelle. Some amount of new code is currently sitting in linux-next that should get the same modifications, which I will do at the end of the merge window. Many thanks to Julia Lawall for helping me learn to write a semantic patch that does all this. ===== begin semantic patch ===== // This adds an llseek= method to all file operations, // as a preparation for making no_llseek the default. // // The rules are // - use no_llseek explicitly if we do nonseekable_open // - use seq_lseek for sequential files // - use default_llseek if we know we access f_pos // - use noop_llseek if we know we don't access f_pos, // but we still want to allow users to call lseek // @ open1 exists @ identifier nested_open; @@ nested_open(...) { <+... nonseekable_open(...) ...+> } @ open exists@ identifier open_f; identifier i, f; identifier open1.nested_open; @@ int open_f(struct inode *i, struct file *f) { <+... ( nonseekable_open(...) | nested_open(...) ) ...+> } @ read disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ read_no_fpos disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { ... 
when != off } @ write @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ write_no_fpos @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { ... when != off } @ fops0 @ identifier fops; @@ struct file_operations fops = { ... }; @ has_llseek depends on fops0 @ identifier fops0.fops; identifier llseek_f; @@ struct file_operations fops = { ... .llseek = llseek_f, ... }; @ has_read depends on fops0 @ identifier fops0.fops; identifier read_f; @@ struct file_operations fops = { ... .read = read_f, ... }; @ has_write depends on fops0 @ identifier fops0.fops; identifier write_f; @@ struct file_operations fops = { ... .write = write_f, ... }; @ has_open depends on fops0 @ identifier fops0.fops; identifier open_f; @@ struct file_operations fops = { ... .open = open_f, ... }; // use no_llseek if we call nonseekable_open //////////////////////////////////////////// @ nonseekable1 depends on !has_llseek && has_open @ identifier fops0.fops; identifier nso ~= "nonseekable_open"; @@ struct file_operations fops = { ... .open = nso, ... +.llseek = no_llseek, /* nonseekable */ }; @ nonseekable2 depends on !has_llseek @ identifier fops0.fops; identifier open.open_f; @@ struct file_operations fops = { ... .open = open_f, ... +.llseek = no_llseek, /* open uses nonseekable */ }; // use seq_lseek for sequential files ///////////////////////////////////// @ seq depends on !has_llseek @ identifier fops0.fops; identifier sr ~= "seq_read"; @@ struct file_operations fops = { ... .read = sr, ... +.llseek = seq_lseek, /* we have seq_read */ }; // use default_llseek if there is a readdir /////////////////////////////////////////// @ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier readdir_e; @@ // any other fop is used that changes pos struct file_operations fops = { ... .readdir = readdir_e, ... +.llseek = default_llseek, /* readdir is present */ }; // use default_llseek if at least one of read/write touches f_pos ///////////////////////////////////////////////////////////////// @ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read.read_f; @@ // read fops use offset struct file_operations fops = { ... .read = read_f, ... +.llseek = default_llseek, /* read accesses f_pos */ }; @ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, ... + .llseek = default_llseek, /* write accesses f_pos */ }; // Use noop_llseek if neither read nor write accesses f_pos /////////////////////////////////////////////////////////// @ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; identifier write_no_fpos.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, .read = read_f, ... 
+.llseek = noop_llseek, /* read and write both use no f_pos */ }; @ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write_no_fpos.write_f; @@ struct file_operations fops = { ... .write = write_f, ... +.llseek = noop_llseek, /* write uses no f_pos */ }; @ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; @@ struct file_operations fops = { ... .read = read_f, ... +.llseek = noop_llseek, /* read uses no f_pos */ }; @ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; @@ struct file_operations fops = { ... +.llseek = noop_llseek, /* no read or write fn */ }; ===== End semantic patch ===== Signed-off-by: Arnd Bergmann Cc: Julia Lawall Cc: Christoph Hellwig --- kernel/configs.c | 1 + kernel/gcov/fs.c | 1 + kernel/kprobes.c | 1 + kernel/pm_qos_params.c | 1 + kernel/profile.c | 1 + kernel/trace/blktrace.c | 2 ++ kernel/trace/ftrace.c | 2 ++ kernel/trace/ring_buffer.c | 1 + kernel/trace/trace_events.c | 6 ++++++ kernel/trace/trace_stack.c | 1 + 10 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/configs.c b/kernel/configs.c index abaee684ecbf..b4066b44a99d 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf, static const struct file_operations ikconfig_file_ops = { .owner = THIS_MODULE, .read = ikconfig_read_current, + .llseek = default_llseek, }; static int __init ikconfig_init(void) diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index f83972b16564..9bd0934f6c33 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -561,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len, static const struct file_operations gcov_reset_fops = { .write = reset_write, .read = reset_read, + .llseek = noop_llseek, }; /* diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 282035f3ae96..8b5ff2655ae0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1992,6 +1992,7 @@ static ssize_t write_enabled_file_bool(struct file *file, static const struct file_operations fops_kp = { .read = read_enabled_file_bool, .write = write_enabled_file_bool, + .llseek = default_llseek, }; static int __kprobes debugfs_kprobe_init(void) diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 645e541a45f6..a96b850ba08a 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -110,6 +110,7 @@ static const struct file_operations pm_qos_power_fops = { .write = pm_qos_power_write, .open = pm_qos_power_open, .release = pm_qos_power_release, + .llseek = noop_llseek, }; /* unlocked internal variant */ diff --git a/kernel/profile.c b/kernel/profile.c index b22a899934cc..66f841b7fbd3 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, static const struct file_operations proc_profile_operations = { .read = read_profile, .write = write_profile, + .llseek = default_llseek, }; #ifdef CONFIG_SMP diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 959f8d6c8cc1..2d5f3a757316 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -326,6 +326,7 @@ static const struct file_operations blk_dropped_fops = { .owner = THIS_MODULE, .open = blk_dropped_open, .read = blk_dropped_read, 
+ .llseek = default_llseek, }; static int blk_msg_open(struct inode *inode, struct file *filp) @@ -365,6 +366,7 @@ static const struct file_operations blk_msg_fops = { .owner = THIS_MODULE, .open = blk_msg_open, .write = blk_msg_write, + .llseek = noop_llseek, }; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa7ece649fe1..5e1ad4763090 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -800,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = { .open = tracing_open_generic, .read = ftrace_profile_read, .write = ftrace_profile_write, + .llseek = default_llseek, }; /* used to initialize the real stat files */ @@ -2632,6 +2633,7 @@ static const struct file_operations ftrace_graph_fops = { .read = seq_read, .write = ftrace_graph_write, .release = ftrace_graph_release, + .llseek = seq_lseek, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 492197e2f86c..3aea966d16de 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3965,6 +3965,7 @@ static const struct file_operations rb_simple_fops = { .open = tracing_open_generic, .read = rb_simple_read, .write = rb_simple_write, + .llseek = default_llseek, }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4c758f146328..0369c5e09984 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -951,6 +951,7 @@ static const struct file_operations ftrace_enable_fops = { .open = tracing_open_generic, .read = event_enable_read, .write = event_enable_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_event_format_fops = { @@ -963,29 +964,34 @@ static const struct file_operations ftrace_event_format_fops = { static const struct file_operations ftrace_event_id_fops = { .open = tracing_open_generic, .read = event_id_read, + .llseek = default_llseek, }; static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_generic, .read = event_filter_read, .write = event_filter_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_subsystem_filter_fops = { .open = tracing_open_generic, .read = subsystem_filter_read, .write = subsystem_filter_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_system_enable_fops = { .open = tracing_open_generic, .read = system_enable_read, .write = system_enable_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, + .llseek = default_llseek, }; static struct dentry *event_trace_events_dir(void) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a6b7e0e0f3eb..4c5dead0c239 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = { .open = tracing_open_generic, .read = stack_max_size_read, .write = stack_max_size_write, + .llseek = default_llseek, }; static void * -- cgit v1.2.1 From 31ddd871fc3db73e2024cb3eb3ee5051edf5a80f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Oct 2010 11:14:49 +0200 Subject: workqueue: Clarify that schedule_on_each_cpu is synchronous The documentation for schedule_on_each_cpu() states that it calls a function on each online CPU from keventd. This can easily be interpreted as an asynchronous call because the description does not mention that flush_work is called. Clarify that it is synchronous.
tj: rephrased a bit Signed-off-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2c6871cbcbee..eb5c1972443a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2676,13 +2676,15 @@ int schedule_delayed_work_on(int cpu, EXPORT_SYMBOL(schedule_delayed_work_on); /** - * schedule_on_each_cpu - call a function on each online CPU from keventd + * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * - * Returns zero on success. - * Returns -ve errno on failure. - * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. */ int schedule_on_each_cpu(work_func_t func) { -- cgit v1.2.1 From daaae6b010ac0f60c9c35e481589966f9f1fcc22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Oct 2010 11:28:15 +0200 Subject: workqueue: remove in_workqueue_context() Commit a25909a4 (lockdep: Add an in_workqueue_context() lockdep-based test function) added in_workqueue_context() but there hasn't been any in-kernel user and the lockdep annotation in workqueue is scheduled to change. Remove the unused function. Signed-off-by: Tejun Heo Cc: Paul E. McKenney --- kernel/workqueue.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index eb5c1972443a..30acdb74cc23 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -310,21 +310,6 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, (cpu) < WORK_CPU_NONE; \ (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) -#ifdef CONFIG_LOCKDEP -/** - * in_workqueue_context() - in context of specified workqueue? - * @wq: the workqueue of interest - * - * Checks lockdep state to see if the current task is executing from - * within a workqueue item. This function exists only if lockdep is - * enabled. - */ -int in_workqueue_context(struct workqueue_struct *wq) -{ - return lock_is_held(&wq->lockdep_map); -} -#endif - #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; -- cgit v1.2.1 From 0fc86c7bd924debd0bddee790ecc884604fdcc63 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 11 Sep 2010 20:11:08 +0200 Subject: rtmutex-tester: make it build without BKL The big kernel lock is going away, so make sure that if it is disabled by Kconfig, we do not try to validate it, which would result in compile errors. 
Signed-off-by: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arjan van de Ven Cc: Andrew Morton --- kernel/rtmutex-tester.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index a56f629b057a..66cb89bc5ef1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -76,7 +76,9 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) } if (!lockwakeup && td->bkl == 4) { +#ifdef CONFIG_LOCK_KERNEL unlock_kernel(); +#endif td->bkl = 0; } return 0; @@ -133,14 +135,18 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) if (td->bkl) return 0; td->bkl = 1; +#ifdef CONFIG_LOCK_KERNEL lock_kernel(); +#endif td->bkl = 4; return 0; case RTTEST_UNLOCKBKL: if (td->bkl != 4) break; +#ifdef CONFIG_LOCK_KERNEL unlock_kernel(); +#endif td->bkl = 0; return 0; -- cgit v1.2.1 From 01b284f9b6d51cc3f3bcf3b49f16d2601d3ca22d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Sep 2010 20:39:22 +0200 Subject: blktrace: remove the big kernel lock According to Jens, this code does not need the BKL at all, it is sufficiently serialized by bd_mutex. Signed-off-by: Arnd Bergmann Cc: Jens Axboe Cc: Steven Rostedt --- kernel/trace/blktrace.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 959f8d6c8cc1..5328e8779d4d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -639,7 +638,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - lock_kernel(); mutex_lock(&bdev->bd_mutex); switch (cmd) { @@ -667,7 +665,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) } mutex_unlock(&bdev->bd_mutex); - unlock_kernel(); return ret; } @@ -1652,10 +1649,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct block_device *bdev; ssize_t ret = -ENXIO; - lock_kernel(); bdev = bdget(part_devt(p)); if (bdev == NULL) - goto out_unlock_kernel; + goto out; q = blk_trace_get_queue(bdev); if (q == NULL) @@ -1683,8 +1679,7 @@ out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); -out_unlock_kernel: - unlock_kernel(); +out: return ret; } @@ -1714,11 +1709,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ret = -ENXIO; - lock_kernel(); p = dev_to_part(dev); bdev = bdget(part_devt(p)); if (bdev == NULL) - goto out_unlock_kernel; + goto out; q = blk_trace_get_queue(bdev); if (q == NULL) @@ -1753,8 +1747,6 @@ out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); -out_unlock_kernel: - unlock_kernel(); out: return ret ? ret : count; } -- cgit v1.2.1 From 747e94ae3d1b4c9bf5380e569f614eb9040b79e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 Oct 2010 13:51:48 -0400 Subject: ring-buffer: Make write slow path out of line Gcc inlines the slow path of the ring buffer write which can hurt performance. This patch simply forces the slow path function rb_move_tail() to always be a function. The ring_buffer_benchmark module with reader_disabled=1 shows that this patch changes the time to record an event from 135 ns to 132 ns. 
(3 ns or 2.22% improvement) Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bca96377fd4e..0b88df849a59 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1823,7 +1823,10 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } -static struct ring_buffer_event * +/* + * This is the slow path, force gcc not to inline it. + */ +static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, struct buffer_page *tail_page, u64 *ts) @@ -1943,7 +1946,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - length; /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) + if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, length, tail, tail_page, ts); -- cgit v1.2.1 From e8bc43e84fada397af1b677b07dbf26e6ac78fcc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Oct 2010 10:58:02 -0400 Subject: ring-buffer: Pass timestamp by value and not by reference The original code for the ring buffer had locations that modified the timestamp and that change was used by the callers. Now, the timestamp is not reused by the callers and there is no reason to pass it by reference. By changing the call to pass by value, lets gcc optimize the code a bit more where it can store the timestamp in a register and not worry about updating the reference. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0b88df849a59..c8ce6bde7fa4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1829,7 +1829,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 *ts) + struct buffer_page *tail_page, u64 ts) { struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; @@ -1912,8 +1912,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer); - next_page->page->time_stamp = *ts; + ts = rb_time_stamp(buffer); + next_page->page->time_stamp = ts; } out_again: @@ -1932,7 +1932,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) + unsigned type, unsigned long length, u64 ts) { struct buffer_page *tail_page; struct ring_buffer_event *event; @@ -1965,7 +1965,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * its timestamp. 
*/ if (!tail) - tail_page->page->time_stamp = *ts; + tail_page->page->time_stamp = ts; return event; } @@ -2008,7 +2008,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 *ts, u64 *delta) + u64 ts, u64 *delta) { struct ring_buffer_event *event; int ret; @@ -2016,7 +2016,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, WARN_ONCE(*delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", (unsigned long long)*delta, - (unsigned long long)*ts, + (unsigned long long)ts, (unsigned long long)cpu_buffer->write_stamp); /* @@ -2051,7 +2051,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->array[0] = 0; } } - cpu_buffer->write_stamp = *ts; + cpu_buffer->write_stamp = ts; /* let the caller know this was the commit */ ret = 1; } else { @@ -2175,7 +2175,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { - commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); + commit = rb_add_time_stamp(cpu_buffer, ts, &delta); if (commit == -EBUSY) goto out_fail; @@ -2187,7 +2187,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, } get_event: - event = __rb_reserve_next(cpu_buffer, 0, length, &ts); + event = __rb_reserve_next(cpu_buffer, 0, length, ts); if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; -- cgit v1.2.1 From f25106aeab7408394b9dd707e5ecf557e269c723 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Oct 2010 12:40:12 -0400 Subject: ring-buffer: Pass delta by value and not by reference The delta between events is passed to the timestamp code by reference and the timestamp code will reset the value. But it can be reset from the caller. No need to pass it in by reference. By changing the call to pass by value, lets gcc optimize the code a bit more where it can store the delta in a register and not worry about updating the reference. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c8ce6bde7fa4..3af77cd47f21 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2008,14 +2008,14 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 ts, u64 *delta) + u64 ts, u64 delta) { struct ring_buffer_event *event; int ret; - WARN_ONCE(*delta > (1ULL << 59), + WARN_ONCE(delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", - (unsigned long long)*delta, + (unsigned long long)delta, (unsigned long long)ts, (unsigned long long)cpu_buffer->write_stamp); @@ -2041,8 +2041,8 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, * and if we can't just make it zero. 
*/ if (rb_event_index(event)) { - event->time_delta = *delta & TS_MASK; - event->array[0] = *delta >> TS_SHIFT; + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; } else { /* try to discard, since we do not need this */ if (!rb_try_to_discard(cpu_buffer, event)) { @@ -2064,8 +2064,6 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, ret = 0; } - *delta = 0; - return ret; } @@ -2175,7 +2173,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { - commit = rb_add_time_stamp(cpu_buffer, ts, &delta); + commit = rb_add_time_stamp(cpu_buffer, ts, delta); + delta = 0; + if (commit == -EBUSY) goto out_fail; -- cgit v1.2.1 From 69d1b839f7eee347e357b3f6cce7f630cc6ff93d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Oct 2010 18:18:05 -0400 Subject: ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. 
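To make the widths concrete: the in-event delta field is 27 bits, so deltas up to 2^27 ns (~134 ms) fit directly; a time extend contributes another 32 bits via array[0], for 59 bits (~18 years) total. A hedged sketch of the split (TS_SHIFT and TS_MASK mirror the constants in ring_buffer.c; split_delta is an invented helper for illustration):

#include <linux/types.h>

#define TS_SHIFT	27
#define TS_MASK		((1ULL << TS_SHIFT) - 1)

/*
 * A delta that fits in TS_MASK goes straight into the event's
 * 27-bit time_delta field. A larger delta needs a time extend,
 * which keeps the low 27 bits in its time_delta field and puts
 * the next 32 bits in array[0].
 */
static void split_delta(u64 delta, u32 *time_delta, u32 *ext_word)
{
	*time_delta = delta & TS_MASK;   /* low 27 bits */
	*ext_word = delta >> TS_SHIFT;   /* up to 32 more bits */
}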
Suggested-by: Mathieu Desnoyers Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 266 ++++++++++++++++++++++++--------------------- 1 file changed, 142 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3af77cd47f21..f50f43107e93 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -224,6 +224,9 @@ enum { RB_LEN_TIME_STAMP = 16, }; +#define skip_time_extend(event) \ + ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) + static inline int rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; @@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event) return length + RB_EVNT_HDR_SIZE; } -/* inline for ring buffer fast paths */ -static unsigned +/* + * Return the length of the given event. Will return + * the length of the time extend if the event is a + * time extend. + */ +static inline unsigned rb_event_length(struct ring_buffer_event *event) { switch (event->type_len) { @@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event) return 0; } +/* + * Return total length of time extend and data, + * or just the event length for all other events. + */ +static inline unsigned +rb_event_ts_length(struct ring_buffer_event *event) +{ + unsigned len = 0; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + /* time extends include the data event after it */ + len = RB_LEN_TIME_EXTEND; + event = skip_time_extend(event); + } + return len + rb_event_length(event); +} + /** * ring_buffer_event_length - return the length of the event * @event: the event to get the length of + * + * Returns the size of the data load of a data event. + * If the event is something other than a data event, it + * returns the size of the event itself. With the exception + * of a TIME EXTEND, where it still returns the size of the + * data load of the data event after it. */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - unsigned length = rb_event_length(event); + unsigned length; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + length = rb_event_length(event); if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; @@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ if (event->type_len) @@ -1546,6 +1583,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } + + return skip_time_extend(event); +} + /** * ring_buffer_update_event - update event type and data * @event: the even to update @@ -1558,28 +1614,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * data field. 
*/ static void -rb_update_event(struct ring_buffer_event *event, - unsigned type, unsigned length) +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, unsigned length, + int add_timestamp, u64 delta) { - event->type_len = type; - - switch (type) { - - case RINGBUF_TYPE_PADDING: - case RINGBUF_TYPE_TIME_EXTEND: - case RINGBUF_TYPE_TIME_STAMP: - break; + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; - case 0: - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - event->array[0] = length; - else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); - break; - default: - BUG(); + /* + * If we need to add a timestamp, then we + * add it to the start of the resevered space. + */ + if (unlikely(add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; + delta = 0; } + + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); } /* @@ -1932,12 +1991,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 ts) + unsigned long length, u64 ts, + u64 delta, int add_timestamp) { struct buffer_page *tail_page; struct ring_buffer_event *event; unsigned long tail, write; + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(add_timestamp)) + length += RB_LEN_TIME_EXTEND; + tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); @@ -1954,11 +2022,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(event, type, length); + rb_update_event(cpu_buffer, event, length, add_timestamp, delta); - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); + local_inc(&tail_page->entries); /* * If this is the first commit on the page, then update @@ -1980,7 +2046,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long addr; new_index = rb_event_index(event); - old_index = new_index + rb_event_length(event); + old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; addr &= PAGE_MASK; @@ -2006,67 +2072,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static int -rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 ts, u64 delta) -{ - struct ring_buffer_event *event; - int ret; - - WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", - (unsigned long long)delta, - (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); - - /* - * The delta is too big, we to add a - * new timestamp. - */ - event = __rb_reserve_next(cpu_buffer, - RINGBUF_TYPE_TIME_EXTEND, - RB_LEN_TIME_EXTEND, - ts); - if (!event) - return -EBUSY; - - if (PTR_ERR(event) == -EAGAIN) - return -EAGAIN; - - /* Only a commited time event can update the write stamp */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * If this is the first on the page, then it was - * updated with the page itself. 
Try to discard it - * and if we can't just make it zero. - */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* try to discard, since we do not need this */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - } - cpu_buffer->write_stamp = ts; - /* let the caller know this was the commit */ - ret = 1; - } else { - /* Try to discard the event */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; - } - ret = 0; - } - - return ret; -} - static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->committing); @@ -2111,9 +2116,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta = 0; - int commit = 0; + u64 ts, delta; int nr_loops = 0; + int add_timestamp; rb_start_commit(cpu_buffer); @@ -2134,6 +2139,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, length = rb_calculate_event_length(length); again: + add_timestamp = 0; + delta = 0; + /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop @@ -2172,33 +2180,24 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { - - commit = rb_add_time_stamp(cpu_buffer, ts, delta); - delta = 0; - - if (commit == -EBUSY) - goto out_fail; - - if (commit == -EAGAIN) - goto again; - - RB_WARN_ON(cpu_buffer, commit < 0); + WARN_ONCE(delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + (unsigned long long)delta, + (unsigned long long)ts, + (unsigned long long)cpu_buffer->write_stamp); + add_timestamp = 1; } } get_event: - event = __rb_reserve_next(cpu_buffer, 0, length, ts); + event = __rb_reserve_next(cpu_buffer, length, ts, + delta, add_timestamp); if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; if (!event) goto out_fail; - if (!rb_event_is_commit(cpu_buffer, event)) - delta = 0; - - event->time_delta = delta; - return event; out_fail: @@ -2311,12 +2310,28 @@ static void rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { + u64 delta; + /* * The event first in the commit queue updates the * time stamp. 
*/ - if (rb_event_is_commit(cpu_buffer, event)) - cpu_buffer->write_stamp += event->time_delta; + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } } static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, @@ -2356,6 +2371,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static inline void rb_event_discard(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + /* array[0] holds the actual length for the discarded event */ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; @@ -3043,12 +3061,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written, or from discarded - * commits. The most that we can have is the number on a single page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -3124,14 +3142,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a timestamp is encountered. - * We can get multiple timestamps by nested interrupts or also - * if filtering is on (discarding commits). Since discarding - * commits can be frequent we can get a lot of timestamps. - * But we limit them by not adding timestamps if they begin - * at the start of a page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) @@ -3829,7 +3845,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (len > (commit - read)) len = (commit - read); - size = rb_event_length(event); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); if (len < size) goto out_unlock; @@ -3851,7 +3868,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, break; event = rb_reader_event(cpu_buffer); - size = rb_event_length(event); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); } while (len > size); /* update bpage */ -- cgit v1.2.1 From 140ff89127c74b1b1c1b0152a36ea3720ccf6bc3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 Oct 2010 10:50:30 -0400 Subject: ring-buffer: Remove condition to add timestamp in fast path There's a condition to check if we should add a time extend or not in the fast path. 
But this condition is racy (in the sense that we can add an unnecessary time extend, but nothing that can break anything). We later check if the time or event time delta should be zero or have real data in it (not racy), making this first check redundant. This check may help save space once in a while, but really is not worth the hassle of trying to save some space that accrues at most once every 134 ms. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f50f43107e93..d9f3e7a82137 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2119,6 +2119,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, u64 ts, delta; int nr_loops = 0; int add_timestamp; + u64 diff; rb_start_commit(cpu_buffer); @@ -2155,29 +2156,13 @@ rb_reserve_next_event(struct ring_buffer *buffer, goto out_fail; ts = rb_time_stamp(cpu_buffer->buffer); + diff = ts - cpu_buffer->write_stamp; - /* - * Only the first commit can update the timestamp. - * Yes there is a race here. If an interrupt comes in - * just after the conditional and it traces too, then it - * will also check the deltas. More than one timestamp may - * also be made. But only the entry that did the actual - * commit will be something other than zero. - */ - if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer))) { - u64 diff; - - diff = ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - /* Did the write stamp get updated already? */ - if (unlikely(ts < cpu_buffer->write_stamp)) - goto get_event; + /* make sure this diff is calculated here */ + barrier(); + /* Did the write stamp get updated already? */ + if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { WARN_ONCE(delta > (1ULL << 59), @@ -2189,7 +2174,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, } } - get_event: event = __rb_reserve_next(cpu_buffer, length, ts, delta, add_timestamp); if (unlikely(PTR_ERR(event) == -EAGAIN)) -- cgit v1.2.1 From d9abde2138e0a00a0d7e44676928efa0ef629d48 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 19 Oct 2010 13:17:08 -0400 Subject: ring-buffer: Micro-optimize with some strategic inlining By using inline and noinline, we are able to make the fast path of recording an event 4% faster.
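The split is worth seeing in isolation: keep the hot path small enough for the compiler to inline, and push the cold failure handling into a separate noinline function so it stays out of the hot cache lines. A minimal sketch with invented names (not the kernel's functions):

/* Cold path: deliberately kept out of line and out of the hot cache. */
static noinline void sketch_recursion_fail(void)
{
	/* expensive reporting, WARN_ON_ONCE()-style handling, etc. */
}

/* Hot path: a few inlined instructions in the common case. */
static inline int sketch_recursion_lock(int *depth)
{
	(*depth)++;
	if (likely(*depth < 16))
		return 0;
	sketch_recursion_fail();
	return -1;
}

This is the same shape trace_recursive_lock()/trace_recursive_fail() take in the diff below.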
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d9f3e7a82137..f5007d0d932d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2078,7 +2078,7 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } -static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2193,13 +2193,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, #define TRACE_RECURSIVE_DEPTH 16 -static int trace_recursive_lock(void) +/* Keep this code out of the fast path cache */ +static noinline void trace_recursive_fail(void) { - current->trace_recursion++; - - if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) - return 0; - /* Disable all tracing before we do anything else */ tracing_off_permanent(); @@ -2211,10 +2207,21 @@ static int trace_recursive_lock(void) in_nmi()); WARN_ON_ONCE(1); +} + +static inline int trace_recursive_lock(void) +{ + current->trace_recursion++; + + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; + + trace_recursive_fail(); + return -1; } -static void trace_recursive_unlock(void) +static inline void trace_recursive_unlock(void) { WARN_ON_ONCE(!current->trace_recursion); -- cgit v1.2.1 From b8b2663bd7c9da04ac804659b9f617c199d0252c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 19 Oct 2010 13:23:25 -0400 Subject: ring-buffer: Remove unused macro RB_TIMESTAMPS_PER_PAGE With the binding of time extends to events we no longer need to use the macro RB_TIMESTAMPS_PER_PAGE. Remove it. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f5007d0d932d..ad25490f8b40 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -441,9 +441,6 @@ static inline int test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) -/* Max number of timestamps that can fit on a page */ -#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) - int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; -- cgit v1.2.1 From 16cdc628c3aed47d02205135b7e2f01e0064f566 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 6 Aug 2010 11:47:14 -0500 Subject: debug_core: move all watch dog syncs to a single function Move the various clock and watch dog syncs to a single function in advance of adding another sync for the rcu stall detector. 
Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index de407c78178d..c812857d0b80 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -470,6 +470,12 @@ static void dbg_cpu_switch(int cpu, int next_cpu) kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER; } +static void dbg_touch_watchdogs(void) +{ + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); +} + static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) { unsigned long flags; @@ -523,8 +529,7 @@ return_normal: if (trace_on) tracing_on(); atomic_dec(&cpu_in_kgdb[cpu]); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); + dbg_touch_watchdogs(); local_irq_restore(flags); return 0; } @@ -541,8 +546,7 @@ return_normal: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); + dbg_touch_watchdogs(); local_irq_restore(flags); goto acquirelock; @@ -659,8 +663,7 @@ kgdb_restore: tracing_on(); /* Free kgdb_active */ atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); + dbg_touch_watchdogs(); local_irq_restore(flags); return kgdb_info[cpu].ret_state; -- cgit v1.2.1 From fb70b5888b70b0b50f738fbfc019445493112eb1 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 13 Aug 2010 12:44:04 -0500 Subject: debug_core: stop rcu warnings on kernel resume When returning from the kernel debugger reset the rcu jiffies_stall value to prevent the rcu stall detector from sending NMI events which invoke a stack dump for each cpu in the system. Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index c812857d0b80..5a3b04d20497 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -474,6 +475,7 @@ static void dbg_touch_watchdogs(void) { touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); } static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) -- cgit v1.2.1 From f7030bbc446430ecd12c9ad02cf0ea94934e5f91 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 11 Oct 2010 10:20:14 -0500 Subject: kdb: Allow kernel loadable modules to add kdb shell functions In order to allow kernel modules to dynamically add a command to the kdb shell the kdb_register, kdb_register_repeat, kdb_unregister, and kdb_printf need to be exported as GPL symbols. Any kernel module that adds a dynamic kdb shell function should only need to include linux/kdb.h. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_io.c | 2 +- kernel/debug/kdb/kdb_main.c | 4 ++++ kernel/debug/kdb/kdb_private.h | 39 --------------------------------------- 3 files changed, 5 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index c9b7f4f90bba..96fdaac46a80 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...) 
return r; } - +EXPORT_SYMBOL_GPL(kdb_printf); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index caf057a3de0e..5448990a299e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2783,6 +2783,8 @@ int kdb_register_repeat(char *cmd, return 0; } +EXPORT_SYMBOL_GPL(kdb_register_repeat); + /* * kdb_register - Compatibility register function for commands that do @@ -2805,6 +2807,7 @@ int kdb_register(char *cmd, return kdb_register_repeat(cmd, func, usage, help, minlen, KDB_REPEAT_NONE); } +EXPORT_SYMBOL_GPL(kdb_register); /* * kdb_unregister - This function is used to unregister a kernel @@ -2833,6 +2836,7 @@ int kdb_unregister(char *cmd) /* Couldn't find it. */ return 1; } +EXPORT_SYMBOL_GPL(kdb_unregister); /* Initialize the kdb command table. */ static void __init kdb_inittab(void) diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index be775f7e81e0..1921e6e4c0bc 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -15,29 +15,6 @@ #include #include "../debug_core.h" -/* Kernel Debugger Error codes. Must not overlap with command codes. */ -#define KDB_NOTFOUND (-1) -#define KDB_ARGCOUNT (-2) -#define KDB_BADWIDTH (-3) -#define KDB_BADRADIX (-4) -#define KDB_NOTENV (-5) -#define KDB_NOENVVALUE (-6) -#define KDB_NOTIMP (-7) -#define KDB_ENVFULL (-8) -#define KDB_ENVBUFFULL (-9) -#define KDB_TOOMANYBPT (-10) -#define KDB_TOOMANYDBREGS (-11) -#define KDB_DUPBPT (-12) -#define KDB_BPTNOTFOUND (-13) -#define KDB_BADMODE (-14) -#define KDB_BADINT (-15) -#define KDB_INVADDRFMT (-16) -#define KDB_BADREG (-17) -#define KDB_BADCPUNUM (-18) -#define KDB_BADLENGTH (-19) -#define KDB_NOBP (-20) -#define KDB_BADADDR (-21) - /* Kernel Debugger Command codes. Must not overlap with error codes. */ #define KDB_CMD_GO (-1001) #define KDB_CMD_CPU (-1002) @@ -93,17 +70,6 @@ */ #define KDB_MAXBPT 16 -/* Maximum number of arguments to a function */ -#define KDB_MAXARGS 16 - -typedef enum { - KDB_REPEAT_NONE = 0, /* Do not repeat this command */ - KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */ - KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */ -} kdb_repeat_t; - -typedef int (*kdb_func_t)(int, const char **); - /* Symbol table format returned by kallsyms. */ typedef struct __ksymtab { unsigned long value; /* Address of symbol */ @@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag); extern int kallsyms_symbol_complete(char *prefix_name, int max_len); /* Exported Symbols for kernel loadable modules to use. */ -extern int kdb_register(char *, kdb_func_t, char *, char *, short); -extern int kdb_register_repeat(char *, kdb_func_t, char *, char *, - short, kdb_repeat_t); -extern int kdb_unregister(char *); - extern int kdb_getarea_size(void *, unsigned long, size_t); extern int kdb_putarea_size(unsigned long, void *, size_t); -- cgit v1.2.1 From e3bda3ac33d3bf3e5a4049e2cabe82d3caaffc26 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 11 Oct 2010 10:20:14 -0500 Subject: kdb,ftdump: Remove reference to internal kdb include Now that include/linux/kdb.h properly exports all the functions required to dynamically add a kdb shell command, the reference to the private kdb header can be removed. 
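To make that concrete, a hedged sketch of the kind of module this enables follows; the command name and callback are invented, but kdb_register(), kdb_unregister() and kdb_printf() match the signatures exported above:

#include <linux/module.h>
#include <linux/kdb.h>

/* Invented example command: print a test greeting from the kdb shell. */
static int kdb_hello(int argc, const char **argv)
{
	kdb_printf("hello from a loadable module\n");
	return 0;
}

static int __init kdb_hello_init(void)
{
	if (kdb_register("hello", kdb_hello, "", "print a test greeting", 0))
		return -EBUSY;
	return 0;
}

static void __exit kdb_hello_exit(void)
{
	kdb_unregister("hello");
}

module_init(kdb_hello_init);
module_exit(kdb_hello_exit);
MODULE_LICENSE("GPL");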
Signed-off-by: Jason Wessel --- kernel/trace/trace_kdb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 7b8ecd751d93..3c5c5dfea0b3 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -13,7 +13,6 @@ #include #include -#include "../debug/kdb/kdb_private.h" #include "trace.h" #include "trace_output.h" -- cgit v1.2.1 From 75d14edee5689716b55afe467acfc13206a31f95 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 11 Oct 2010 10:20:14 -0500 Subject: kdb: Fix oops in kdb_unregister Nothing should try to use kdb_commands directly as sometimes it is null. Instead, use the for_each_kdbcmd() iterator. This particular problem dates back to the initial kdb merge (2.6.35), but at that point nothing was dynamically unregistering commands from the kdb shell. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5448990a299e..4226f32517d1 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2826,7 +2826,7 @@ int kdb_unregister(char *cmd) /* * find the command. */ - for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { + for_each_kdbcmd(kp, i) { if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { kp->cmd_name = NULL; return 0; -- cgit v1.2.1 From 91b152aa85bbcf076e269565394c31964f940371 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 23 Aug 2010 09:20:14 -0500 Subject: kdb,kgdb: fix sparse fixups Fix the following sparse warnings: kdb_main.c:328:5: warning: symbol 'kdbgetu64arg' was not declared. Should it be static? kgdboc.c:246:12: warning: symbol 'kgdboc_early_init' was not declared. Should it be static? kgdb.c:652:26: warning: incorrect type in argument 1 (different address spaces) kgdb.c:652:26: expected void const *ptr kgdb.c:652:26: got struct perf_event *[noderef] *pev The one in kgdb.c required the (void * __force) because of the return code from register_wide_hw_breakpoint looking like: return (void __percpu __force *)ERR_PTR(err); Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_private.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 1921e6e4c0bc..35d69ed1dfb5 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -105,6 +105,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t); extern int kdb_putword(unsigned long, unsigned long, size_t); extern int kdbgetularg(const char *, unsigned long *); +extern int kdbgetu64arg(const char *, u64 *); extern char *kdbgetenv(const char *); extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, long *, char **); @@ -216,14 +217,6 @@ extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); extern void kdb_meminfo_proc_show(void); -#ifdef CONFIG_KALLSYMS -extern const char *kdb_walk_kallsyms(loff_t *pos); -#else /* ! CONFIG_KALLSYMS */ -static inline const char *kdb_walk_kallsyms(loff_t *pos) -{ - return NULL; -} -#endif /* ! 
CONFIG_KALLSYMS */ extern char *kdb_getstr(char *, size_t, char *); /* Defines for kdb_symbol_print */ -- cgit v1.2.1 From c1bb9a9c1911036549c5cdfb23f32d7d20ffdc5a Mon Sep 17 00:00:00 2001 From: Dongdong Deng Date: Mon, 13 Sep 2010 06:58:00 -0500 Subject: debug_core: disable hw_breakpoints on all cores in kgdb_cpu_enter() The slave cpus do not have the hw breakpoints disabled upon entry to the debug_core and as a result could cause unrecoverable recursive faults on badly placed breakpoints, or get out of sync with the arch specific hw breakpoint operations. This patch addresses the problem by invoking kgdb_disable_hw_debug() earlier in kgdb_cpu_enter() for each cpu that enters the debug core. The hw breakpoint dis/enable flow should be: master_debug_cpu slave_debug_cpu \ / kgdb_cpu_enter | kgdb_disable_hw_debug --> uninstall pre-enabled hw_breakpoint | do add/rm dis/enable operations on hw_breakpoints on master_debug_cpu.. | correct_hw_break --> correct/install the enabled hw_breakpoint | leave_kgdb Signed-off-by: Dongdong Deng Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 5a3b04d20497..bb9497724808 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -485,6 +485,9 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) int error; int i, cpu; int trace_on = 0; + + kgdb_disable_hw_debug(ks->linux_regs); + acquirelock: /* * Interrupts will be restored by the 'trap return' code, except when @@ -569,8 +572,6 @@ return_normal: if (dbg_io_ops->pre_exception) dbg_io_ops->pre_exception(); - kgdb_disable_hw_debug(ks->linux_regs); - /* * Get the passive CPU lock which will hold all the non-primary * CPU in a spin state while the debugger is active @@ -661,6 +662,8 @@ kgdb_restore: else kgdb_sstep_pid = 0; } + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); if (trace_on) tracing_on(); /* Free kgdb_active */ -- cgit v1.2.1 From dfee3a7b92208b30f77876068aece9ea571270c2 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 21 May 2010 08:46:00 -0500 Subject: debug_core: refactor locking for master/slave cpus For quite some time there have been problems with memory barriers and various races with NMI on multi processor systems using the kernel debugger. The algorithm for entering the kernel debug core and resuming kernel execution was racy and had several known edge case problems with attempting to debug something on a heavily loaded system using breakpoints that are hit repeatedly and quickly. The prior "locking" design entry worked as follows: * The atomic counter kgdb_active was used with atomic exchange in order to elect a master cpu out of all the cpus that may have taken a debug exception. * The master cpu increments all elements of passive_cpu_wait[]. * The master cpu issues the round up cpus message. * Each "slave cpu" that enters the debug core increments its own element in cpu_in_kgdb[]. * Each "slave cpu" spins on passive_cpu_wait[] until it becomes 0. * The master cpu debugs the system. The new scheme removes the two arrays of atomic counters and replaces them with 2 single counters. One counter is used to count the number of cpus waiting to become a master cpu (because one or more hit an exception). The second counter is used to indicate how many cpus have entered as slave cpus.
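In compressed form, the new bookkeeping is just this (a sketch of the state the patch introduces; the detailed entry logic is spelled out next):

static DEFINE_RAW_SPINLOCK(dbg_master_lock);	/* owned by the master cpu */
static DEFINE_RAW_SPINLOCK(dbg_slave_lock);	/* held while slaves must spin */
static atomic_t masters_in_kgdb;		/* cpus that took an exception */
static atomic_t slaves_in_kgdb;			/* cpus that were rounded up */

/* The master knows the system is quiesced once every online cpu is
 * accounted for in one of the two counters. */
static int sketch_all_cpus_synced(int online_cpus)
{
	return atomic_read(&masters_in_kgdb) +
	       atomic_read(&slaves_in_kgdb) == online_cpus;
}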
The new entry logic works as follows: * One or more cpus enters via kgdb_handle_exception() and increments the masters_in_kgdb. Each cpu attempts to get the spin lock called dbg_master_lock. * The master cpu sets kgdb_active to the current cpu. * The master cpu takes the spinlock dbg_slave_lock. * The master cpu asks to round up all the other cpus. * Each slave cpu that is not already in kgdb_handle_exception() will enter and increment slaves_in_kgdb. Each slave will now spin try_locking on dbg_slave_lock. * The master cpu waits for the sum of masters_in_kgdb and slaves_in_kgdb to be equal to the sum of the online cpus. * The master cpu debugs the system. In the new design the kgdb_active can only be changed while holding dbg_master_lock. Stress testing has not turned up any further entry/exit races that existed in the prior locking design. The prior locking design suffered from atomic variables not being truly atomic (in the capacity as used by kgdb) along with memory barrier races. Signed-off-by: Jason Wessel Acked-by: Dongdong Deng --- kernel/debug/debug_core.c | 105 ++++++++++++++++++++++++---------------------- kernel/debug/debug_core.h | 1 + 2 files changed, 56 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index bb9497724808..26dbdc37d219 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -110,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { */ atomic_t kgdb_active = ATOMIC_INIT(-1); EXPORT_SYMBOL_GPL(kgdb_active); +static DEFINE_RAW_SPINLOCK(dbg_master_lock); +static DEFINE_RAW_SPINLOCK(dbg_slave_lock); /* * We use NR_CPUs not PERCPU, in case kgdb is used to debug early * bootup code (which might not have percpu set up yet): */ -static atomic_t passive_cpu_wait[NR_CPUS]; -static atomic_t cpu_in_kgdb[NR_CPUS]; +static atomic_t masters_in_kgdb; +static atomic_t slaves_in_kgdb; static atomic_t kgdb_break_tasklet_var; atomic_t kgdb_setting_breakpoint; @@ -478,14 +480,23 @@ static void dbg_touch_watchdogs(void) rcu_cpu_stall_reset(); } -static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) +static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, + int exception_state) { unsigned long flags; int sstep_tries = 100; int error; - int i, cpu; + int cpu; int trace_on = 0; + int online_cpus = num_online_cpus(); + kgdb_info[ks->cpu].enter_kgdb++; + kgdb_info[ks->cpu].exception_state |= exception_state; + + if (exception_state == DCPU_WANT_MASTER) + atomic_inc(&masters_in_kgdb); + else + atomic_inc(&slaves_in_kgdb); kgdb_disable_hw_debug(ks->linux_regs); acquirelock: @@ -500,14 +511,15 @@ acquirelock: kgdb_info[cpu].task = current; kgdb_info[cpu].ret_state = 0; kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; - /* - * Make sure the above info reaches the primary CPU before - * our cpu_in_kgdb[] flag setting does: - */ - atomic_inc(&cpu_in_kgdb[cpu]); - if (exception_level == 1) + /* Make sure the above info reaches the primary CPU */ + smp_mb(); + + if (exception_level == 1) { + if (raw_spin_trylock(&dbg_master_lock)) + atomic_xchg(&kgdb_active, cpu); goto cpu_master_loop; + } /* * CPU will loop if it is a slave or request to become a kgdb @@ -519,10 +531,12 @@ cpu_loop: kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; goto cpu_master_loop; } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { - if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) + if (raw_spin_trylock(&dbg_master_lock)) { + atomic_xchg(&kgdb_active, 
cpu); break; + } } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { - if (!atomic_read(&passive_cpu_wait[cpu])) + if (!raw_spin_is_locked(&dbg_slave_lock)) goto return_normal; } else { return_normal: @@ -533,7 +547,11 @@ return_normal: arch_kgdb_ops.correct_hw_break(); if (trace_on) tracing_on(); - atomic_dec(&cpu_in_kgdb[cpu]); + kgdb_info[cpu].exception_state &= + ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); + kgdb_info[cpu].enter_kgdb--; + smp_mb__before_atomic_dec(); + atomic_dec(&slaves_in_kgdb); dbg_touch_watchdogs(); local_irq_restore(flags); return 0; @@ -551,6 +569,7 @@ return_normal: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); + raw_spin_unlock(&dbg_master_lock); dbg_touch_watchdogs(); local_irq_restore(flags); @@ -576,10 +595,8 @@ return_normal: * Get the passive CPU lock which will hold all the non-primary * CPU in a spin state while the debugger is active */ - if (!kgdb_single_step) { - for (i = 0; i < NR_CPUS; i++) - atomic_inc(&passive_cpu_wait[i]); - } + if (!kgdb_single_step) + raw_spin_lock(&dbg_slave_lock); #ifdef CONFIG_SMP /* Signal the other CPUs to enter kgdb_wait() */ @@ -590,10 +607,9 @@ return_normal: /* * Wait for the other CPUs to be notified and be waiting for us: */ - for_each_online_cpu(i) { - while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) - cpu_relax(); - } + while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + + atomic_read(&slaves_in_kgdb)) != online_cpus) + cpu_relax(); /* * At this point the primary processor is completely @@ -634,24 +650,11 @@ cpu_master_loop: if (dbg_io_ops->post_exception) dbg_io_ops->post_exception(); - atomic_dec(&cpu_in_kgdb[ks->cpu]); - if (!kgdb_single_step) { - for (i = NR_CPUS-1; i >= 0; i--) - atomic_dec(&passive_cpu_wait[i]); - /* - * Wait till all the CPUs have quit from the debugger, - * but allow a CPU that hit an exception and is - * waiting to become the master to remain in the debug - * core. - */ - for_each_online_cpu(i) { - while (kgdb_do_roundup && - atomic_read(&cpu_in_kgdb[i]) && - !(kgdb_info[i].exception_state & - DCPU_WANT_MASTER)) - cpu_relax(); - } + raw_spin_unlock(&dbg_slave_lock); + /* Wait till all the CPUs have quit from the debugger. */ + while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb)) + cpu_relax(); } kgdb_restore: @@ -666,8 +669,15 @@ kgdb_restore: arch_kgdb_ops.correct_hw_break(); if (trace_on) tracing_on(); + + kgdb_info[cpu].exception_state &= + ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); + kgdb_info[cpu].enter_kgdb--; + smp_mb__before_atomic_dec(); + atomic_dec(&masters_in_kgdb); /* Free kgdb_active */ atomic_set(&kgdb_active, -1); + raw_spin_unlock(&dbg_master_lock); dbg_touch_watchdogs(); local_irq_restore(flags); @@ -686,7 +696,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) { struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; - int ret; ks->cpu = raw_smp_processor_id(); ks->ex_vector = evector; @@ -697,11 +706,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) if (kgdb_reenter_check(ks)) return 0; /* Ouch, double exception ! 
*/ - kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; - ret = kgdb_cpu_enter(ks, regs); - kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | - DCPU_IS_SLAVE); - return ret; + if (kgdb_info[ks->cpu].enter_kgdb != 0) + return 0; + + return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); } int kgdb_nmicallback(int cpu, void *regs) @@ -714,12 +722,9 @@ int kgdb_nmicallback(int cpu, void *regs) ks->cpu = cpu; ks->linux_regs = regs; - if (!atomic_read(&cpu_in_kgdb[cpu]) && - atomic_read(&kgdb_active) != -1 && - atomic_read(&kgdb_active) != cpu) { - kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; - kgdb_cpu_enter(ks, regs); - kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE; + if (kgdb_info[ks->cpu].enter_kgdb == 0 && + raw_spin_is_locked(&dbg_master_lock)) { + kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE); return 0; } #endif diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index c5d753d80f67..3494c28a7e7a 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -40,6 +40,7 @@ struct debuggerinfo_struct { int exception_state; int ret_state; int irq_depth; + int enter_kgdb; }; extern struct debuggerinfo_struct kgdb_info[]; -- cgit v1.2.1 From 495363d380b4f4745bd8677912688654afc44020 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 21 May 2010 08:46:00 -0500 Subject: kdb,debug_core: adjust master cpu switch logic against new debug_core locking The kdb shell needs to enforce switching back to the original CPU that took the exception before restoring normal kernel execution. Resuming from a different CPU than the one that took the original exception will cause problems with spin locks that are freed from a different processor than the one that took the lock. The special logic in dbg_cpu_switch() can go away entirely because the state of which cpus want to be masters or slaves will remain unchanged between entry and exit of the debug_core exception context. Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 16 ++-------------- kernel/debug/kdb/kdb_debugger.c | 3 +-- kernel/debug/kdb/kdb_main.c | 12 ++++++------ 3 files changed, 9 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 26dbdc37d219..fec596da9bd0 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -460,19 +460,6 @@ static int kgdb_reenter_check(struct kgdb_state *ks) return 1; } -static void dbg_cpu_switch(int cpu, int next_cpu) -{ - /* Mark the cpu we are switching away from as a slave when it - * holds the kgdb_active token. This must be done so that the - * that all the cpus wait in for the debug core will not enter - * again as the master.
*/ - if (cpu == atomic_read(&kgdb_active)) { - kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; - kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER; - } - kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER; -} - static void dbg_touch_watchdogs(void) { touch_softlockup_watchdog_sync(); @@ -638,7 +625,8 @@ cpu_master_loop: if (error == DBG_PASS_EVENT) { dbg_kdb_mode = !dbg_kdb_mode; } else if (error == DBG_SWITCH_CPU_EVENT) { - dbg_cpu_switch(cpu, dbg_switch_cpu); + kgdb_info[dbg_switch_cpu].exception_state |= + DCPU_NEXT_MASTER; goto cpu_loop; } else { kgdb_info[cpu].ret_state = error; diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index bf6e8270e957..dd0b1b7dd02c 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks) } /* Set initial kdb state variables */ KDB_STATE_CLEAR(KGDB_TRANS); - kdb_initial_cpu = ks->cpu; + kdb_initial_cpu = atomic_read(&kgdb_active); kdb_current_task = kgdb_info[ks->cpu].task; kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; /* Remove any breakpoints as needed by kdb and clear single step */ @@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks) ks->pass_exception = 1; KDB_FLAG_SET(CATASTROPHIC); } - kdb_initial_cpu = ks->cpu; if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { KDB_STATE_CLEAR(SSBPT); KDB_STATE_CLEAR(DOING_SS); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4226f32517d1..d7bda21a106b 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1749,13 +1749,13 @@ static int kdb_go(int argc, const char **argv) int nextarg; long offset; + if (raw_smp_processor_id() != kdb_initial_cpu) { + kdb_printf("go must execute on the entry cpu, " + "please use \"cpu %d\" and then execute go\n", + kdb_initial_cpu); + return KDB_BADCPUNUM; + } if (argc == 1) { - if (raw_smp_processor_id() != kdb_initial_cpu) { - kdb_printf("go
must be issued from the " - "initial cpu, do cpu %d first\n", - kdb_initial_cpu); - return KDB_ARGCOUNT; - } nextarg = 1; diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); -- cgit v1.2.1 From 5260562754c0aa4b95eebb1f851eaccce7286365 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Oct 2010 23:41:11 +0100 Subject: MN10300: Fix the PERCPU() alignment to allow for workqueues In the MN10300 arch, we occasionally see an assertion being tripped in alloc_cwqs() at the following line: /* just in case, make sure it's actually aligned */ ---> BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); return wq->cpu_wq.v ? 0 : -ENOMEM; The values are: wq->cpu_wq.v => 0x902776e0 align => 0x100 and align is calculated by the following: const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, __alignof__(unsigned long long)); This is because the pointer in question (wq->cpu_wq.v) loses some of its lower bits to control flags, and so the object it points to must be sufficiently aligned to avoid the need to use those bits for pointing to things. Currently, 4 control bits and 4 colour bits are used in normal circumstances, plus a debugging bit if debugging is set. This requires the cpu_workqueue_struct struct to be at least 256 bytes aligned (or 512 bytes aligned with debugging). PERCPU() alignment on MN10300, however, is only 32 bytes as set in vmlinux.lds.S. So we set this to PAGE_SIZE (4096) to match most other arches and stick a comment in alloc_cwqs() for anyone else who triggers the assertion. Reported-by: Akira Takeuchi Signed-off-by: David Howells Acked-by: Mark Salter Cc: Tejun Heo Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 30acdb74cc23..e5ff2cbaadc2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2791,7 +2791,9 @@ static int alloc_cwqs(struct workqueue_struct *wq) } } - /* just in case, make sure it's actually aligned */ + /* just in case, make sure it's actually aligned + * - this is affected by PERCPU() alignment in vmlinux.lds.S + */ BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); return wq->cpu_wq.v ? 0 : -ENOMEM; } -- cgit v1.2.1 From 7e360c38abe2c70eae3ba5a8a17f17671d8b77c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Oct 2010 09:32:55 +0200 Subject: fs: allow for more than 2^31 files Andrew, Could you please review this patch, you probably are the right guy to take it, because it crosses fs and net trees. Note : /proc/sys/fs/file-nr is a read-only file, so this patch doesn't depend on the previous patch (sysctl: fix min/max handling in __do_proc_doulongvec_minmax()) Thanks ! [PATCH V4] fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow.
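The overflow is easy to reproduce in isolation. A small userspace sketch using the numbers from the report (signed overflow is formally undefined; the wrapped value shown is what a typical two's-complement machine produces):

#include <stdio.h>

int main(void)
{
	int max_files = 1503238553;	/* ~1.5 billion, fits in an int */
	int limit = 2 * max_files;	/* 3,006,477,106 exceeds INT_MAX */

	printf("%d\n", limit);		/* typically prints -1288490190 */
	return 0;
}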
Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin's problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Signed-off-by: Al Viro --- kernel/sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c224770f..694b140852c2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = { { .procname = "file-nr", .data = &files_stat, - .maxlen = 3*sizeof(int), + .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, - .maxlen = sizeof(int), + .maxlen = sizeof(files_stat.max_files), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "nr_open", -- cgit v1.2.1 From cffbc8aa334f55c9ed42d25202eb3ebf3a97c195 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 23 Oct 2010 05:03:02 -0400 Subject: fs: Convert nr_inodes and nr_unused to per-cpu counters The number of inodes allocated does not need to be tied to the addition or removal of an inode to/from a list. If we are not tied to a list lock, we could update the counters when inodes are initialised or destroyed, but to do that we need to convert the counters to be per-cpu (i.e. independent of a lock). This means that we have the freedom to change the list/locking implementation without needing to care about the counters. Based on a patch originally from Eric Dumazet. [AV: cleaned up a bit, fixed build breakage on weird configs] Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 694b140852c2..99a510cbfbb3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1340,14 +1340,14 @@ static struct ctl_table fs_table[] = { .data = &inodes_stat, .maxlen = 2*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_inodes, }, { .procname = "file-nr", -- cgit v1.2.1 From 7de9c6ee3ecffd99e1628e81a5ea5468f7581a1f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Oct 2010 11:11:40 -0400 Subject: new helper: ihold() Clones an existing reference to inode; caller must already hold one.
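The helper's contract is narrow: it turns one reference the caller already owns into two, so it can never be used to take the first reference. A hedged sketch of typical use (grab_inode() is a made-up name):

/* Made-up example: hand a second reference to another holder. */
static struct inode *grab_inode(struct inode *inode)
{
	ihold(inode);	/* legal only because we already hold a reference */
	return inode;	/* the recipient must eventually iput() it */
}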
Signed-off-by: Al Viro --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a118bf160e0b..6c683b37f2ce 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -169,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - atomic_inc(&key->shared.inode->i_count); + ihold(key->shared.inode); break; case FUT_OFF_MMSHARED: atomic_inc(&key->private.mm->mm_count); -- cgit v1.2.1 From 85fe4025c616a7c0ed07bc2fc8c5371b07f3888c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 Oct 2010 11:19:54 -0400 Subject: fs: do not assign default i_ino in new_inode Instead of always assigning an increasing inode number in new_inode, move the call to assign it into those callers that actually need it. For now, the set of callers that need it is estimated conservatively; that is, the call is added to all filesystems that do not assign an i_ino by themselves. For a few more filesystems we can avoid assigning any inode number given that they aren't user visible, and for others it could be done lazily when an inode number is actually needed, but that's left for later patches. Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b69b8d0313d..9270d532ec3c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -777,6 +777,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); -- cgit v1.2.1 From 312d3ca856d369bb04d0443846b85b4cdde6fa8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Oct 2010 05:36:23 -0400 Subject: fs: use percpu counter for nr_dentry and nr_dentry_unused The nr_dentry stat is a globally touched cacheline and an atomic operation twice over the lifetime of a dentry. It is used for the benefit of userspace only. Turn it into a per-cpu counter and always decrement it in d_free instead of doing various batching operations to reduce lock hold times in the callers. Based on an earlier patch from Nick Piggin . Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99a510cbfbb3..8b77ff5c502c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1377,7 +1377,7 @@ static struct ctl_table fs_table[] = { .data = &dentry_stat, .maxlen = 6*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_dentry, }, { .procname = "overflowuid", -- cgit v1.2.1 From 3d5992d2ac7dc09aed8ab537cba074589f0f0a52 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 26 Oct 2010 14:21:23 -0700 Subject: oom: add per-mm oom disable count It's pointless to kill a task if another thread sharing its mm cannot be killed to allow future memory freeing. A subsequent patch will prevent kills in such cases, but first it's necessary to have a way to flag a task that shares memory with an OOM_DISABLE task that doesn't incur an additional tasklist scan, which would make select_bad_process() an O(n^2) function.
This patch adds an atomic counter to struct mm_struct that follows how many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN. They cannot be killed by the kernel, so their memory cannot be freed in oom conditions. This only requires task_lock() on the task that we're operating on, it does not require mm->mmap_sem since task_lock() pins the mm and the operation is atomic. [rientjes@google.com: changelog and sys_unshare() code] [rientjes@google.com: protect oom_disable_count with task_lock in fork] [rientjes@google.com: use old_mm for oom_disable_count in exec] Signed-off-by: Ying Han Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +++ kernel/fork.c | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e2bdf37f9fde..894179a32ec1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -687,6 +688,8 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index c445f8cc408d..e87aaaaf5131 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); + atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -741,6 +743,8 @@ good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -1299,8 +1303,13 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + task_lock(p); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&p->mm->oom_disable_count); + task_unlock(p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -1693,6 +1702,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) active_mm = current->active_mm; current->mm = new_mm; current->active_mm = new_mm; + if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&mm->oom_disable_count); + atomic_inc(&new_mm->oom_disable_count); + } activate_mm(active_mm, new_mm); new_mm = mm; } -- cgit v1.2.1 From 61ecdb801ef2cd28e32442383106d7837d76deac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:47 -0700 Subject: mm: strictly nested kmap_atomic() Ensure kmap_atomic() usage is strictly nested Signed-off-by: Peter Zijlstra Reviewed-by: Rik van Riel Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ac7eb109f196..9e3581f4619a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -984,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) src = kmap_atomic(s_page, KM_USER0); dst = kmap_atomic(d_page, KM_USER1); do_copy_page(dst, src); - kunmap_atomic(src, KM_USER0); kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); } else { if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel @@ -2273,8 +2273,8 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) memcpy(buf, kaddr1, PAGE_SIZE); memcpy(kaddr1, kaddr2, PAGE_SIZE); memcpy(kaddr2, buf, PAGE_SIZE); - kunmap_atomic(kaddr1, KM_USER0); kunmap_atomic(kaddr2, KM_USER1); + kunmap_atomic(kaddr1, KM_USER0); } /** -- cgit v1.2.1 From 3ecb01df3261d3b1f02ccfcf8384e2a255d2a1d0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Oct 2010 14:22:27 -0700 Subject: use clear_page()/copy_page() in favor of memset()/memcpy() on whole pages After all that's what they are intended for. Signed-off-by: Jan Beulich Cc: Miklos Szeredi Cc: "Eric W. Biederman" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 +- kernel/power/snapshot.c | 14 +++++++------- kernel/power/swap.c | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index c0613f7d6730..b55045bc7563 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image, ptr = kmap(page); /* Start with a clear page */ - memset(ptr, 0, PAGE_SIZE); + clear_page(ptr); ptr += maddr & ~PAGE_MASK; mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); if (mchunk > mbytes) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 9e3581f4619a..0dac75ea4456 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) */ safe_copy_page(buffer, s_page); dst = kmap_atomic(d_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); } else { safe_copy_page(page_address(d_page), s_page); @@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle) memory_bm_position_reset(&orig_bm); memory_bm_position_reset(©_bm); } else if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); + clear_page(buffer); pack_pfns(buffer, &orig_bm); } else { struct page *page; @@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle) void *kaddr; kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); + copy_page(buffer, kaddr); kunmap_atomic(kaddr, KM_USER0); handle->buffer = buffer; } else { @@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void) void *dst; dst = kmap_atomic(last_highmem_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); last_highmem_page = NULL; } @@ -2270,9 +2270,9 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) kaddr1 = kmap_atomic(p1, KM_USER0); kaddr2 = kmap_atomic(p2, KM_USER1); - 
memcpy(buf, kaddr1, PAGE_SIZE); - memcpy(kaddr1, kaddr2, PAGE_SIZE); - memcpy(kaddr2, buf, PAGE_SIZE); + copy_page(buf, kaddr1); + copy_page(kaddr1, kaddr2); + copy_page(kaddr2, buf); kunmap_atomic(kaddr2, KM_USER1); kunmap_atomic(kaddr1, KM_USER0); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 916eaa790399..a0e4a86ccf94 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -251,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (bio_chain) { src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); if (src) { - memcpy(src, buf, PAGE_SIZE); + copy_page(src, buf); } else { WARN_ON_ONCE(1); bio_chain = NULL; /* Go synchronous */ @@ -325,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, error = write_page(handle->cur, handle->cur_swap, NULL); if (error) goto out; - memset(handle->cur, 0, PAGE_SIZE); + clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; } @@ -910,7 +910,7 @@ int swsusp_check(void) hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); - memset(swsusp_header, 0, PAGE_SIZE); + clear_page(swsusp_header); error = hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) -- cgit v1.2.1 From ca1cab37d91cbe8a8333732540d43cabb54cfa85 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:34 -0700 Subject: workqueues: s/ON_STACK/ONSTACK/ Silly though it is, completions and wait_queue_heads use foo_ONSTACK (COMPLETION_INITIALIZER_ONSTACK, DECLARE_COMPLETION_ONSTACK, __WAIT_QUEUE_HEAD_INIT_ONSTACK and DECLARE_WAIT_QUEUE_HEAD_ONSTACK) so I guess workqueues should do the same thing. s/INIT_WORK_ON_STACK/INIT_WORK_ONSTACK/ s/INIT_DELAYED_WORK_ON_STACK/INIT_DELAYED_WORK_ONSTACK/ Cc: Peter Zijlstra Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e5ff2cbaadc2..90db1bd1a978 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2064,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, * checks and call back into the fixup functions where we * might deadlock. */ - INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); + INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); -- cgit v1.2.1 From 571428be550fbe37160596995e96ad398873fcbd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:43 -0700 Subject: kernel/user.c: add lock release annotation on free_user() free_user() releases uidhash_lock but was missing annotation. Add it. This removes following sparse warnings: include/linux/spinlock.h:339:9: warning: context imbalance in 'free_user' - unexpected unlock kernel/user.c:120:6: warning: context imbalance in 'free_uid' - wrong count at exit Signed-off-by: Namhyung Kim Cc: Ingo Molnar Cc: Dhaval Giani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 7e72614b736d..2c7d8d5914b1 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) * upon function exit. 
*/ static void free_user(struct user_struct *up, unsigned long flags) + __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); -- cgit v1.2.1 From 518de9b39e854542de59bfb8b9f61c8f7ecf808b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c224770f..694b140852c2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = { { .procname = "file-nr", .data = &files_stat, - .maxlen = 3*sizeof(int), + .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, - .maxlen = sizeof(int), + .maxlen = sizeof(files_stat.max_files), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "nr_open", -- cgit v1.2.1 From ca51c5a76345b28c6f1b742f9f5f0a6fc9afd9ca Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: kernel/stop_machine.c: fix unused variable warning kernel/stop_machine.c: In function `cpu_stopper_thread': kernel/stop_machine.c:265: warning: unused variable `ksym_buf' ksym_buf[] is unused if WARN_ON() is a no-op. 
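As a minimal illustrative sketch (hypothetical variable and macro names, not from this patch), the __maybe_unused annotation applied below tells the compiler that a variable may legitimately go unused when its only consumer compiles away, avoiding both the warning and #ifdef clutter:

#include <linux/kernel.h>

static void example(void)
{
	/* Only consumed when EXAMPLE_DEBUG is defined; __maybe_unused
	 * silences -Wunused-variable when the pr_debug() compiles out. */
	char buf[32] __maybe_unused;

#ifdef EXAMPLE_DEBUG
	snprintf(buf, sizeof(buf), "state=%d", 42);
	pr_debug("%s\n", buf);
#endif
}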
Signed-off-by: Rakib Mullick Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 090c28812ce1..3cb4a9a8ae1c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -262,7 +262,7 @@ repeat: cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; - char ksym_buf[KSYM_NAME_LEN]; + char ksym_buf[KSYM_NAME_LEN] __maybe_unused; __set_current_state(TASK_RUNNING); -- cgit v1.2.1 From 4ce6494dbd8909718840bb88d5a699ef6ce5c212 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 26 Oct 2010 14:22:45 -0700 Subject: stop_machine: convert cpu notifier to return encapsulate errno value In commit e6bde73b07edeb703d4c89c1daabc09c303de11f ("cpu-hotplug: return better errno on cpu hotplug failure"), the cpu notifier can return an encapsulated errno value. This converts the cpu notifier to return an encapsulated errno value for stop_machine(). Signed-off-by: Akinobu Mita Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 3cb4a9a8ae1c..2df820b03beb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -304,7 +304,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", cpu); if (IS_ERR(p)) - return NOTIFY_BAD; + return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); kthread_bind(p, cpu); sched_set_stop_task(cpu, p); @@ -372,7 +372,7 @@ static int __init cpu_stop_init(void) /* start one for the boot cpu */ err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, bcpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); register_cpu_notifier(&cpu_stop_cpu_notifier); -- cgit v1.2.1 From 6c095efd82e8f6a98515426a733110f91cf0a709 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:47 -0700 Subject: printk: fixup declaration of kmsg_reasons Move redundant 'const' after '*' to make pointer itself const Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2531017795f6..d18933a92819 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1511,7 +1511,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); -static const char const *kmsg_reasons[] = { +static const char * const kmsg_reasons[] = { [KMSG_DUMP_OOPS] = "oops", [KMSG_DUMP_PANIC] = "panic", [KMSG_DUMP_KEXEC] = "kexec", -- cgit v1.2.1 From 8155c02a44a95562e1ae0999360eb31288d7195a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:47 -0700 Subject: printk: add lock context annotation acquire_console_semaphore_for_printk() releases logbuf_lock but was missing proper annotation. Add it. 
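A minimal sketch of the sparse lock annotations used in this and the neighbouring patches (hypothetical lock and function names, not from these commits): __releases() declares that a function returns with the named lock dropped, and __acquires() that it returns holding it, which keeps sparse's lock context accounting balanced:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);

/* Called with example_lock held; returns with it released. */
static void finish_and_unlock(void)
	__releases(&example_lock)
{
	/* ... final work under the lock ... */
	spin_unlock(&example_lock);
}

/* Returns with example_lock held. */
static void start_and_lock(void)
	__acquires(&example_lock)
{
	spin_lock(&example_lock);
}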
Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index d18933a92819..6ff26151f4ff 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -647,6 +647,7 @@ static inline int can_use_console(unsigned int cpu) * released but interrupts still disabled. */ static int acquire_console_semaphore_for_printk(unsigned int cpu) + __releases(&logbuf_lock) { int retval = 0; -- cgit v1.2.1 From 674dff6507d3f9b110219ea125cf5e1213c9acef Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:48 -0700 Subject: printk: change type of 'boot_delay' to int * get_option() takes its 2nd arg as int *, so passing boot_delay to it caused the following warnings from sparse: kernel/printk.c:223:27: warning: incorrect type in argument 2 (different signedness) kernel/printk.c:223:27: expected int *pint kernel/printk.c:223:27: got unsigned int static [toplevel] * Since boot_delay can't grow beyond 10,000, changing it to 'int' will not produce any problem. Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 6ff26151f4ff..b2ebaee8c377 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup); #ifdef CONFIG_BOOT_PRINTK_DELAY -static unsigned int boot_delay; /* msecs delay after each printk during bootup */ +static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) -- cgit v1.2.1 From f5d87d851d76a390d0fab2f77bd1d563d69ee586 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:49 -0700 Subject: printk: declare printk_ratelimit_state in ratelimit.h Adding a declaration of printk_ratelimit_state in ratelimit.h removes potential build breakage and the following sparse warning: kernel/printk.c:1426:1: warning: symbol 'printk_ratelimit_state' was not declared. Should it be static? [akpm@linux-foundation.org: remove unneeded ifdef] Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 694b140852c2..48d9d689498f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -161,8 +161,6 @@ extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif -extern struct ratelimit_state printk_ratelimit_state; - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.1 From ee2f154a598e96df2ebb01648a7699373bc085c7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 26 Oct 2010 14:17:25 -0700 Subject: docbook: add more wait/wake/completion to device-drivers docbook Add more wait, wake, and completion interfaces to the device-drivers docbook. Fix kernel-doc notation in the added files. 
Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/wait.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/wait.c b/kernel/wait.c index c4bd3d825f35..b0310eb6cc1e 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); -/* +/** * finish_wait - clean up after waiting in a queue * @q: waitqueue waited on * @wait: wait descriptor @@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); -/* +/** * abort_exclusive_wait - abort exclusive waiting in a queue * @q: waitqueue waited on * @wait: wait descriptor - * @state: runstate of the waiter to be woken + * @mode: runstate of the waiter to be woken * @key: key to identify a wait bit queue or %NULL * * Sets current thread back to running state and removes -- cgit v1.2.1 From abbce906d05ec37289cd0c3b4e35b2db26eab19b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 20 Sep 2010 01:58:08 +0200 Subject: (trivial) Fix compiler warning in kernel/modules.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with CONFIG_KALLSYMS=n gives following warning: /mnt/src/linux-git/kernel/module.c: In function ‘post_relocation’: /mnt/src/linux-git/kernel/module.c:2534:2: warning: passing argument 2 of ‘add_kallsyms’ discards qualifiers from pointer target type /mnt/src/linux-git/kernel/module.c:2038:13: note: expected ‘struct load_info *’ but argument is of type ‘const struct load_info *’ Signed-off-by: MichaÅ‚ MirosÅ‚aw Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2df46301a7a4..437a74a7524a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2037,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info) { } -static void add_kallsyms(struct module *mod, struct load_info *info) +static void add_kallsyms(struct module *mod, const struct load_info *info) { } #endif /* CONFIG_KALLSYMS */ -- cgit v1.2.1 From 3a5f65df5a0fcbaa35e5417c0420d691fee4ac56 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 Oct 2010 17:28:36 +0100 Subject: Typedef SMP call function pointer Typedef the pointer to the function to be called by smp_call_function() and friends: typedef void (*smp_call_func_t)(void *info); as it is used in a fair number of places. Signed-off-by: David Howells cc: linux-arch@vger.kernel.org --- kernel/smp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ed6aacfcb7ef..12ed8b013e2d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); * * Returns 0 on success, else a negative status code. 
*/ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, +int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { struct call_single_data d = { @@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single); * 3) any other online cpu in @mask */ int smp_call_function_any(const struct cpumask *mask, - void (*func)(void *info), void *info, int wait) + smp_call_func_t func, void *info, int wait) { unsigned int cpu; const struct cpumask *nodemask; @@ -416,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, * must be disabled when calling this function. */ void smp_call_function_many(const struct cpumask *mask, - void (*func)(void *), void *info, bool wait) + smp_call_func_t func, void *info, bool wait) { struct call_function_data *data; unsigned long flags; @@ -500,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many); * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -int smp_call_function(void (*func)(void *), void *info, int wait) +int smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); -- cgit v1.2.1 From d5de4ddb1bc430289bede76c0d87cabee93f749a Mon Sep 17 00:00:00 2001 From: Tomasz Buchert Date: Wed, 27 Oct 2010 15:33:32 -0700 Subject: cgroup_freezer: unnecessary test in cgroup_freezing_or_frozen() The root freezer_state is always CGROUP_THAWED so we can remove the special case from the code. The test itself can be handy and is extracted to static function. Signed-off-by: Tomasz Buchert Cc: Matt Helsley Cc: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ce71ed53e88f..321e2a007d25 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } -int cgroup_freezing_or_frozen(struct task_struct *task) +static inline int __cgroup_freezing_or_frozen(struct task_struct *task) { - struct freezer *freezer; - enum freezer_state state; + enum freezer_state state = task_freezer(task)->state; + return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); +} +int cgroup_freezing_or_frozen(struct task_struct *task) +{ + int result; task_lock(task); - freezer = task_freezer(task); - if (!freezer->css.cgroup->parent) - state = CGROUP_THAWED; /* root cgroup can't be frozen */ - else - state = freezer->state; + result = __cgroup_freezing_or_frozen(task); task_unlock(task); - - return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); + return result; } /* -- cgit v1.2.1 From 0bdba580ab052a21e3eda2764ed22d9ee962392b Mon Sep 17 00:00:00 2001 From: Tomasz Buchert Date: Wed, 27 Oct 2010 15:33:33 -0700 Subject: cgroup_freezer: fix can_attach() to prohibit moving from/to freezing/frozen cgroups It is possible to move a task from its cgroup even if this group is 'FREEZING'. This results in a nasty bug - the moved task will become frozen OUTSIDE its original cgroup and will remain in a permanent 'D' state. This patch allows to migrate the task only between THAWED cgroups. This behavior was observed and easily reproduced on a single core laptop. Notice that reproducibility depends highly on the machine used. 
A program and instructions for reproducing the bug can be fetched from: http://pentium.hopto.org/~thinred/repos/linux-misc/freezer_bug.c Signed-off-by: Tomasz Buchert Cc: Matt Helsley Cc: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 321e2a007d25..c287627bed3d 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -173,24 +173,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss, /* * Anything frozen can't move or be moved to/from. - * - * Since orig_freezer->state == FROZEN means that @task has been - * frozen, so it's sufficient to check the latter condition. */ - if (is_task_frozen_enough(task)) + freezer = cgroup_freezer(new_cgroup); + if (freezer->state != CGROUP_THAWED) return -EBUSY; - freezer = cgroup_freezer(new_cgroup); - if (freezer->state == CGROUP_FROZEN) + rcu_read_lock(); + if (__cgroup_freezing_or_frozen(task)) { + rcu_read_unlock(); return -EBUSY; + } + rcu_read_unlock(); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (is_task_frozen_enough(c)) { + if (__cgroup_freezing_or_frozen(c)) { rcu_read_unlock(); return -EBUSY; } -- cgit v1.2.1 From 2d3cbf8bc852ac1bc3d098186143c5973f87b753 Mon Sep 17 00:00:00 2001 From: Tomasz Buchert Date: Wed, 27 Oct 2010 15:33:34 -0700 Subject: cgroup_freezer: update_freezer_state() does incorrect state transitions There are 4 state transitions possible for a freezer. Only the FREEZING -> FROZEN transition is done lazily. This patch allows update_freezer_state only to perform this transition and renames the function to update_if_frozen. Moreover, the is_task_frozen_enough function is removed and every occurrence of it is replaced with frozen(). Therefore, for a group to become FROZEN, every task must be frozen. The previous version could trigger the following bug: when a cgroup is in the process of freezing (but none of its tasks are frozen yet), update_freezer_state() (called from freezer_read or freezer_write) would incorrectly report that the group is 'THAWED' (because nfrozen = 0), allowing the transition FREEZING -> THAWED without writing anything to 'freezer.state'. This is incorrect according to the documentation. This could result in a 'THAWED' cgroup with frozen tasks inside. 
A code to reproduce this bug is available here: http://pentium.hopto.org/~thinred/repos/linux-misc/freezer_bug2.c [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Tomasz Buchert Cc: Matt Helsley Cc: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index c287627bed3d..e7bebb7c6c38 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -153,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss, kfree(cgroup_freezer(cgroup)); } -/* Task is frozen or will freeze immediately when next it gets woken */ -static bool is_task_frozen_enough(struct task_struct *task) -{ - return frozen(task) || - (task_is_stopped_or_traced(task) && freezing(task)); -} - /* * The call to cgroup_lock() in the freezer.state write method prevents * a write to that file racing against an attach, and hence the @@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) /* * caller must hold freezer->lock */ -static void update_freezer_state(struct cgroup *cgroup, +static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; unsigned int nfrozen = 0, ntotal = 0; + enum freezer_state old_state = freezer->state; cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - if (is_task_frozen_enough(task)) + if (frozen(task)) nfrozen++; } - /* - * Transition to FROZEN when no new tasks can be added ensures - * that we never exist in the FROZEN state while there are unfrozen - * tasks. - */ - if (nfrozen == ntotal) - freezer->state = CGROUP_FROZEN; - else if (nfrozen > 0) - freezer->state = CGROUP_FREEZING; - else - freezer->state = CGROUP_THAWED; + if (old_state == CGROUP_THAWED) { + BUG_ON(nfrozen > 0); + } else if (old_state == CGROUP_FREEZING) { + if (nfrozen == ntotal) + freezer->state = CGROUP_FROZEN; + } else { /* old_state == CGROUP_FROZEN */ + BUG_ON(nfrozen != ntotal); + } + cgroup_iter_end(cgroup, &it); } @@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, if (state == CGROUP_FREEZING) { /* We change from FREEZING to FROZEN lazily if the cgroup was * only partially frozen when we exitted write. */ - update_freezer_state(cgroup, freezer); + update_if_frozen(cgroup, freezer); state = freezer->state; } spin_unlock_irq(&freezer->lock); @@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) while ((task = cgroup_iter_next(cgroup, &it))) { if (!freeze_task(task, true)) continue; - if (is_task_frozen_enough(task)) + if (frozen(task)) continue; if (!freezing(task) && !freezer_should_skip(task)) num_cant_freeze_now++; @@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup, spin_lock_irq(&freezer->lock); - update_freezer_state(cgroup, freezer); + update_if_frozen(cgroup, freezer); if (goal_state == freezer->state) goto out; -- cgit v1.2.1 From 97978e6d1f2da0073416870410459694fbdbfd9b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 27 Oct 2010 15:33:35 -0700 Subject: cgroup: add clone_children control file The ns_cgroup is a control group interacting with the namespaces. When a new namespace is created, a corresponding cgroup is automatically created too. 
The cgroup name is the pid of the process that did 'unshare', or of the child of 'clone'. This cgroup is tied to the namespace because it prevents a process from escaping the control group, and it uses the post_clone callback so the child cgroup inherits the values of the parent cgroup. Unfortunately, the more we use this cgroup, the more problems we face with it: (1) when a process unshares, the cgroup name may conflict with a previous cgroup with the same pid, so unshare or clone return -EEXIST (2) the cgroup creation is out of control because there may be an application creating several namespaces, for which the system will automatically create several cgroups behind its back and leave them on the cgroupfs (eg. a vrf based on the network namespace). (3) the mix of (1) and (2) forces an administrator to regularly check and clean these cgroups. This patchset removes the ns_cgroup by adding a new cgroup flag and a corresponding cgroupfs mount option. It enables copying of the parent cgroup when a child cgroup is created. We can then safely remove the ns_cgroup as this flag provides compatibility. We now have to manually create a cgroup and add the task to it, which is consistent with the cgroup framework. This patch: Sent as an answer to a previous thread around the ns_cgroup. https://lists.linux-foundation.org/pipermail/containers/2009-June/018627.html It adds a control file 'clone_children' for a cgroup. This control file is a boolean specifying whether the child cgroup should be a clone of the parent cgroup or not. The default value is 'false'. This flag makes the child cgroup call the post_clone callback of each subsystem, if it is available. At present, cpuset is the only subsystem which has implemented the post_clone callback. The option can be set at mount time by specifying the 'clone_children' mount option. Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Cc: Eric W. 
Biederman Acked-by: Paul Menage Reviewed-by: Li Zefan Cc: Jamal Hadi Salim Cc: Matt Helsley Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9270d532ec3c..4b218a46ddd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -243,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +static int clone_children(const struct cgroup *cgrp) +{ + return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); +} + /* * for_each_subsys() allows you to iterate on each subsystem attached to * an active hierarchy @@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + if (clone_children(&root->top_cgroup)) + seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); mutex_unlock(&cgroup_mutex); @@ -1050,6 +1057,7 @@ struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; + bool clone_children; char *name; /* User explicitly requested empty subsystem */ bool none; @@ -1097,6 +1105,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) opts->none = true; } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); + } else if (!strcmp(token, "clone_children")) { + opts->clone_children = true; } else if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) @@ -1355,6 +1365,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) strcpy(root->name, opts->name); + if (opts->clone_children) + set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); return root; } @@ -3173,6 +3185,23 @@ fail: return ret; } +static u64 cgroup_clone_children_read(struct cgroup *cgrp, + struct cftype *cft) +{ + return clone_children(cgrp); +} + +static int cgroup_clone_children_write(struct cgroup *cgrp, + struct cftype *cft, + u64 val) +{ + if (val) + set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + else + clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + return 0; +} + /* * for the common functions, 'private' gives the type of file */ @@ -3203,6 +3232,11 @@ static struct cftype files[] = { .write_string = cgroup_write_event_control, .mode = S_IWUGO, }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, }; static struct cftype cft_release_agent = { @@ -3332,6 +3366,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + if (clone_children(parent)) + set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); @@ -3346,6 +3383,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_destroy; } /* At error, ->destroy() callback has to free assigned ID. 
*/ + if (clone_children(parent) && ss->post_clone) + ss->post_clone(ss, cgrp); } cgroup_lock_hierarchy(root); -- cgit v1.2.1 From 32a8cf235e2f192eb002755076994525cdbaa35a Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 27 Oct 2010 15:33:37 -0700 Subject: cgroup: make the mount options parsing more accurate Current behavior: ================= (1) When we mount a cgroup, we can specify the 'all' option, which means to enable all the cgroup subsystems. This is the default when no option is specified. (2) If we want to mount a cgroup with a subset of the supported cgroup subsystems, we have to specify a list of subsystem names as the mount option. (3) If we specify another option like 'noprefix' or 'release_agent', the current code also wants 'all' or a subsystem name to be specified. Not critical, but a bit unfriendly, since we should assume (1) in this case. (4) Logically, the 'all' option is mutually exclusive with a subsystem name, but this is not detected. In other words: succeeds : mount -t cgroup -o all,freezer cgroup /cgroup => is it 'all' or 'freezer' ? fails : mount -t cgroup -o noprefix cgroup /cgroup => succeeds if we do '-o noprefix,all' The following patches consolidate the mount option checks a bit. New behavior: ============= (1) untouched (2) untouched (3) the 'all' option is assumed by default when an option other than a subsystem name is specified (4) raises an error In other words: fails : mount -t cgroup -o all,freezer cgroup /cgroup succeeds : mount -t cgroup -o noprefix cgroup /cgroup For the sake of readability, the if ... then ... else ... if ... indentation when parsing the options has been changed to: if ... then ... continue fi Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Reviewed-by: Li Zefan Reviewed-by: Paul Menage Cc: Eric W. 
Biederman Cc: Jamal Hadi Salim Cc: Matt Helsley Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 90 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4b218a46ddd3..3e6517e51fd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1074,7 +1074,8 @@ struct cgroup_sb_opts { */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { - char *token, *o = data ?: "all"; + char *token, *o = data; + bool all_ss = false, one_ss = false; unsigned long mask = (unsigned long)-1; int i; bool module_pin_failed = false; @@ -1090,24 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) while ((token = strsep(&o, ",")) != NULL) { if (!*token) return -EINVAL; - if (!strcmp(token, "all")) { - /* Add all non-disabled subsystems */ - opts->subsys_bits = 0; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (!ss->disabled) - opts->subsys_bits |= 1ul << i; - } - } else if (!strcmp(token, "none")) { + if (!strcmp(token, "none")) { /* Explicitly have no subsystems */ opts->none = true; - } else if (!strcmp(token, "noprefix")) { + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); - } else if (!strcmp(token, "clone_children")) { + continue; + } + if (!strcmp(token, "clone_children")) { opts->clone_children = true; - } else if (!strncmp(token, "release_agent=", 14)) { + continue; + } + if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; @@ -1115,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - } else if (!strncmp(token, "name=", 5)) { + continue; + } + if (!strncmp(token, "name=", 5)) { const char *name = token + 5; /* Can't specify an empty name */ if (!strlen(name)) @@ -1137,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) GFP_KERNEL); if (!opts->name) return -ENOMEM; - } else { - struct cgroup_subsys *ss; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - ss = subsys[i]; - if (ss == NULL) - continue; - if (!strcmp(token, ss->name)) { - if (!ss->disabled) - set_bit(i, &opts->subsys_bits); - break; - } - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; + + continue; + } + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (strcmp(token, ss->name)) + continue; + if (ss->disabled) + continue; + + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + set_bit(i, &opts->subsys_bits); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + + /* + * If the 'all' option was specified select all the subsystems, + * otherwise 'all, 'none' and a subsystem name options were not + * specified, let's default to 'all' + */ + if (all_ss || (!all_ss && !one_ss && !opts->none)) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (ss->disabled) + continue; + set_bit(i, &opts->subsys_bits); } 
} -- cgit v1.2.1 From f4a2589feaef0a9b737a3e582b37ee96695bb25f Mon Sep 17 00:00:00 2001 From: Evgeny Kuznetsov Date: Wed, 27 Oct 2010 15:33:37 -0700 Subject: cgroups: add check for strcpy destination string overflow Function "strcpy" is used without a check for the maximum allowed source string length and could cause destination string overflow. A check of the string length is added before using "strcpy". The function now returns an error if the source string length is more than the maximum. akpm: presently considered NotABug, but add the check for general future-safeness and robustness. Signed-off-by: Evgeny Kuznetsov Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3e6517e51fd3..5cf366965d0c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1922,6 +1922,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + if (strlen(buffer) >= PATH_MAX) + return -EINVAL; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; strcpy(cgrp->root->release_agent_path, buffer); -- cgit v1.2.1 From 45531757b45cae0ce64c5aff08c2534d5a0fa3e7 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 27 Oct 2010 15:33:38 -0700 Subject: cgroup: notify ns_cgroup deprecated The ns_cgroup will be removed very soon. Let's warn, for this version, that ns_cgroup is deprecated. Make ns_cgroup and clone_children exclusive. If clone_children is set and the ns_cgroup is mounted, let's fail with EINVAL when the ns_cgroup subsys is created (a printk will help the user understand why the creation fails). Update the feature-removal-schedule file with the deprecated ns_cgroup. Signed-off-by: Daniel Lezcano Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ns_cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 2a5dfec8efe0..2c98ad94ba0e 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, return ERR_PTR(-EPERM); if (!cgroup_is_descendant(cgroup, current)) return ERR_PTR(-EPERM); + if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { + printk("ns_cgroup can't be created with parent " + "'clone_children' set.\n"); + return ERR_PTR(-EINVAL); + } + + printk_once("ns_cgroup deprecated: consider using the " + "'clone_children' flag without the ns_cgroup.\n"); ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); if (!ns_cgroup) -- cgit v1.2.1 From c4b5ed250eebf854d40f27b43362c80f115cb57a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:33:44 -0700 Subject: ptrace: annotate lock context change on exit_ptrace() exit_ptrace() releases and regrabs tasklist_lock but was missing proper annotation. Add it. Signed-off-by: Namhyung Kim Acked-by: Roland McGrath Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f34d798ef4a2..4afd9b86cc0b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data) * and reacquire the lock. 
*/ void exit_ptrace(struct task_struct *tracer) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) { struct task_struct *p, *n; LIST_HEAD(ptrace_dead); -- cgit v1.2.1 From 4abf986960ecda6a87fc2f795aacf888a2f0127e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:33:45 -0700 Subject: ptrace: change signature of sys_ptrace() and friends Since the userspace API of the ptrace syscall defines @addr and @data as void pointers, it is more appropriate to define them as unsigned long in the kernel, so the related functions are changed accordingly. 'unsigned long' is typically used elsewhere in the kernel as an opaque data type, and using it helps clean up a lot of sparse warnings. Suggested-by: Arnd Bergmann Signed-off-by: Namhyung Kim Acked-by: Arnd Bergmann Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4afd9b86cc0b..06981a8b271b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -404,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds return copied; } -static int ptrace_setoptions(struct task_struct *child, long data) +static int ptrace_setoptions(struct task_struct *child, unsigned long data) { child->ptrace &= ~PT_TRACE_MASK; @@ -483,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) #define is_sysemu_singlestep(request) 0 #endif -static int ptrace_resume(struct task_struct *child, long request, long data) +static int ptrace_resume(struct task_struct *child, long request, + unsigned long data) { if (!valid_signal(data)) return -EIO; @@ -560,7 +561,7 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, #endif int ptrace_request(struct task_struct *child, long request, - long addr, long data) + unsigned long addr, unsigned long data) { int ret = -EIO; siginfo_t siginfo; @@ -693,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid) #define arch_ptrace_attach(child) do { } while (0) #endif -SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) +SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, + unsigned long, data) { struct task_struct *child; long ret; @@ -734,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) return ret; } -int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) +int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, + unsigned long data) { unsigned long tmp; int copied; @@ -745,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) return put_user(tmp, (unsigned long __user *)data); } -int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) +int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, + unsigned long data) { int copied; -- cgit v1.2.1 From 9fed81dc40f5a1ac2783bcc78d4029873be72894 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:33:46 -0700 Subject: ptrace: cleanup ptrace_request() Use the new 'datavp' and 'datalp' variables to remove unnecessary casts. 
Signed-off-by: Namhyung Kim Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 06981a8b271b..ea7ce0215cd1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -565,6 +565,8 @@ int ptrace_request(struct task_struct *child, long request, { int ret = -EIO; siginfo_t siginfo; + void __user *datavp = (void __user *) data; + unsigned long __user *datalp = datavp; switch (request) { case PTRACE_PEEKTEXT: @@ -581,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setoptions(child, data); break; case PTRACE_GETEVENTMSG: - ret = put_user(child->ptrace_message, (unsigned long __user *) data); + ret = put_user(child->ptrace_message, datalp); break; case PTRACE_GETSIGINFO: ret = ptrace_getsiginfo(child, &siginfo); if (!ret) - ret = copy_siginfo_to_user((siginfo_t __user *) data, - &siginfo); + ret = copy_siginfo_to_user(datavp, &siginfo); break; case PTRACE_SETSIGINFO: - if (copy_from_user(&siginfo, (siginfo_t __user *) data, - sizeof siginfo)) + if (copy_from_user(&siginfo, datavp, sizeof siginfo)) ret = -EFAULT; else ret = ptrace_setsiginfo(child, &siginfo); @@ -624,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request, } mmput(mm); - ret = put_user(tmp, (unsigned long __user *) data); + ret = put_user(tmp, datalp); break; } #endif @@ -653,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_SETREGSET: { struct iovec kiov; - struct iovec __user *uiov = (struct iovec __user *) data; + struct iovec __user *uiov = datavp; if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) return -EFAULT; -- cgit v1.2.1 From b8ed374e202e23caaf9bd77dcadc9de6447faaa8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:34:06 -0700 Subject: signals: annotate lock_task_sighand() lock_task_sighand() grabs sighand->siglock in case of returning non-NULL but unlock_task_sighand() releases it unconditionally. This leads sparse to complain about the lock context imbalance. Rename and wrap lock_task_sighand() using __cond_lock() macro to make sparse happy. Suggested-by: Eric Dumazet Signed-off-by: Namhyung Kim Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 919562c3d6b7..e921409b85a9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p) return count; } -struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) +struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, + unsigned long *flags) { struct sighand_struct *sighand; -- cgit v1.2.1 From b84011508360d6885a9d95a235ec77d56f133377 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:34:07 -0700 Subject: signals: annotate lock context change on ptrace_stop() ptrace_stop() releases and regrabs current->sighand->siglock but was missing proper annotation. Add it. 
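A minimal sketch of the __cond_lock() wrapping used by the lock_task_sighand() change above (hypothetical names, not from these commits): sparse cannot model a lock taken only on the success path, so the real function gets a __ prefix and a macro informs the checker that the lock is held exactly when the call succeeds:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(obj_lock);

/* Takes obj_lock and returns nonzero only when the object is usable. */
extern int __try_grab_object(void *obj);

#define try_grab_object(obj) \
	__cond_lock(&obj_lock, __try_grab_object(obj))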
Signed-off-by: Namhyung Kim Acked-by: Roland McGrath Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e921409b85a9..4e3cff10fdce 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1618,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk) * is gone, we keep current->exit_code unless clear_code. */ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) + __releases(¤t->sighand->siglock) + __acquires(¤t->sighand->siglock) { if (arch_ptrace_stop_needed(exit_code, info)) { /* -- cgit v1.2.1 From 9b1bf12d5d51bca178dea21b04a0805e29d60cf1 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 27 Oct 2010 15:34:08 -0700 Subject: signals: move cred_guard_mutex from task_struct to signal_struct Oleg Nesterov pointed out we have to prevent multiple-threads-inside-exec itself and we can reuse ->cred_guard_mutex for it. Yes, concurrent execve() has no worth. Let's move ->cred_guard_mutex from task_struct to signal_struct. It naturally prevent multiple-threads-inside-exec. Signed-off-by: KOSAKI Motohiro Reviewed-by: Oleg Nesterov Acked-by: Roland McGrath Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cred.c | 4 +--- kernel/fork.c | 2 ++ kernel/ptrace.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 9a3e22641fe7..6a1aa004e376 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() - * - The caller must hold current->cred_guard_mutex + * - The caller must hold ->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { @@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - mutex_init(&p->cred_guard_mutex); - if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && diff --git a/kernel/fork.c b/kernel/fork.c index e87aaaaf5131..3b159c5991b7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -908,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + mutex_init(&sig->cred_guard_mutex); + return 0; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ea7ce0215cd1..99bbaa3e5b0d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task) * under ptrace. */ retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(&task->cred_guard_mutex)) + if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) goto out; task_lock(task); @@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task) unlock_tasklist: write_unlock_irq(&tasklist_lock); unlock_creds: - mutex_unlock(&task->cred_guard_mutex); + mutex_unlock(&task->signal->cred_guard_mutex); out: return retval; } -- cgit v1.2.1 From d16e15f5b029fc7d03540ba0e5fb23b0abb0ebe0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:34:10 -0700 Subject: exit: add lock context annotation on find_new_reaper() find_new_reaper() releases and regrabs tasklist_lock but was missing proper annotations. Add it. 
This removes the following sparse warning: warning: context imbalance in 'find_new_reaper' - unexpected unlock Signed-off-by: Namhyung Kim Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 894179a32ec1..b194febf5799 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -703,6 +703,8 @@ static void exit_mm(struct task_struct * tsk) * space. */ static struct task_struct *find_new_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *thread; -- cgit v1.2.1 From 478735e38887077ac77a9756121b6ce0cb956e2f Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 27 Oct 2010 15:34:15 -0700 Subject: /proc/stat: fix scalability of irq sum of all cpu In /proc/stat, the number of events for each IRQ is shown by summing that irq's events on all cpus. But we can make use of kstat_irqs(). kstat_irqs() does the same calculation. If !CONFIG_GENERIC_HARDIRQ, it's not a big cost. (Both the number of cpus and the number of irqs are small.) If a system is very big and CONFIG_GENERIC_HARDIRQ is set, it does for_each_irq() for_each_cpu() - look up a radix tree - read desc->irq_stat[cpu] This is not efficient. This patch adds kstat_irqs() for CONFIG_GENERIC_HARDIRQ and changes the calculation to for_each_irq() look up radix tree for_each_cpu() - read desc->irq_stat[cpu] This reduces the cost. A test on a (4096 cpus, 256 nodes, 4592 irqs) host (by Jack Steiner): %time cat /proc/stat > /dev/null Before Patch: 2.459 sec After Patch : .561 sec [akpm@linux-foundation.org: unexport kstat_irqs, coding-style tweaks] [akpm@linux-foundation.org: fix unused variable 'per_irq_sum'] Signed-off-by: KAMEZAWA Hiroyuki Tested-by: Jack Steiner Acked-by: Jack Steiner Cc: Yinghai Lu Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/irqdesc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9d917ff72675..9988d03797f5 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) struct irq_desc *desc = irq_to_desc(irq); return desc ? desc->kstat_irqs[cpu] : 0; } + +#ifdef CONFIG_GENERIC_HARDIRQS +unsigned int kstat_irqs(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + int cpu; + int sum = 0; + + if (!desc) + return 0; + for_each_possible_cpu(cpu) + sum += desc->kstat_irqs[cpu]; + return sum; +} +#endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.2.1 From 85893120699f8bae8caa12a8ee18ab5fceac978e Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 27 Oct 2010 15:34:43 -0700 Subject: delayacct: align to 8 byte boundary on 64-bit systems prepare_reply() sets up an skb for the response. The payload contains: +--------------------------------+ | genlmsghdr - 4 bytes | +--------------------------------+ | NLA header - 4 bytes | /* Aggregate header */ +-+------------------------------+ | | NLA header - 4 bytes | /* PID header */ | +------------------------------+ | | pid/tgid - 4 bytes | | +------------------------------+ | | NLA header - 4 bytes | /* stats header */ | + -----------------------------+ <- oops. 
aligned on 4 byte boundary | | struct taskstats - 328 bytes | +-+------------------------------+ The start of the taskstats struct must be 8 byte aligned on IA64 (and other systems with 8 byte alignment rules for 64-bit types) or runtime alignment warnings will be issued. This patch pads the pid/tgid field out to sizeof(long), which forces the alignment of taskstats. The getdelays userspace code is ok with this since it assumes 32-bit pid/tgid and then honors that header's length field. An array is used to avoid exposing kernel memory contents to userspace in the response. Signed-off-by: Jeff Mahoney Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 11281d5792bd..5a651aa63d61 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -360,6 +360,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) struct nlattr *na, *ret; int aggr; + /* If we don't pad, we end up with alignment on a 4 byte boundary. + * This causes lots of runtime warnings on systems requiring 8 byte + * alignment */ + u32 pids[2] = { pid, 0 }; + int pid_size = ALIGN(sizeof(pid), sizeof(long)); + aggr = (type == TASKSTATS_TYPE_PID) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; @@ -367,7 +373,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) na = nla_nest_start(skb, aggr); if (!na) goto err; - if (nla_put(skb, type, sizeof(pid), &pid) < 0) + if (nla_put(skb, type, pid_size, pids) < 0) goto err; ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); if (!ret) -- cgit v1.2.1 From 9323312592cca636d7c2580dc85fa4846efa86a2 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 27 Oct 2010 15:34:44 -0700 Subject: taskstats: separate taskstats commands Move each taskstats command into a single function. This makes the code more readable and makes it easier to add new commands. 
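The shape of the refactoring below can be sketched in miniature (hypothetical attribute and function names, not the actual taskstats code): a thin dispatcher routes on whichever attribute is present, and each command's parsing and reply construction lives in its own helper:

#include <net/genetlink.h>

enum { CMD_ATTR_UNSPEC, CMD_ATTR_FOO, CMD_ATTR_BAR, __CMD_ATTR_MAX };

static int cmd_attr_foo(struct genl_info *info)
{
	/* parse CMD_ATTR_FOO, then build and send the reply */
	return 0;
}

static int cmd_attr_bar(struct genl_info *info)
{
	/* parse CMD_ATTR_BAR, then build and send the reply */
	return 0;
}

/* The dispatcher stays trivially readable as commands are added. */
static int example_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
	if (info->attrs[CMD_ATTR_FOO])
		return cmd_attr_foo(info);
	if (info->attrs[CMD_ATTR_BAR])
		return cmd_attr_bar(info);
	return -EINVAL;
}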
Signed-off-by: Michael Holzheu Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 118 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 78 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 5a651aa63d61..9970cae04f15 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -430,39 +430,46 @@ err: return rc; } -static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +static int cmd_attr_register_cpumask(struct genl_info *info) { - int rc; - struct sk_buff *rep_skb; - struct taskstats *stats; - size_t size; cpumask_var_t mask; + int rc; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) - goto free_return_rc; - if (rc == 0) { - rc = add_del_listener(info->snd_pid, mask, REGISTER); - goto free_return_rc; - } + goto out; + rc = add_del_listener(info->snd_pid, mask, REGISTER); +out: + free_cpumask_var(mask); + return rc; +} +static int cmd_attr_deregister_cpumask(struct genl_info *info) +{ + cpumask_var_t mask; + int rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) - goto free_return_rc; - if (rc == 0) { - rc = add_del_listener(info->snd_pid, mask, DEREGISTER); -free_return_rc: - free_cpumask_var(mask); - return rc; - } + goto out; + rc = add_del_listener(info->snd_pid, mask, DEREGISTER); +out: free_cpumask_var(mask); + return rc; +} + +static int cmd_attr_pid(struct genl_info *info) +{ + struct taskstats *stats; + struct sk_buff *rep_skb; + size_t size; + u32 pid; + int rc; - /* - * Size includes space for nested attributes - */ size = nla_total_size(sizeof(u32)) + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); @@ -471,33 +478,64 @@ free_return_rc: return rc; rc = -EINVAL; - if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { - u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); - stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); - if (!stats) - goto err; - - rc = fill_pid(pid, NULL, stats); - if (rc < 0) - goto err; - } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { - u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); - stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); - if (!stats) - goto err; - - rc = fill_tgid(tgid, NULL, stats); - if (rc < 0) - goto err; - } else + pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); + if (!stats) + goto err; + + rc = fill_pid(pid, NULL, stats); + if (rc < 0) + goto err; + return send_reply(rep_skb, info); +err: + nlmsg_free(rep_skb); + return rc; +} + +static int cmd_attr_tgid(struct genl_info *info) +{ + struct taskstats *stats; + struct sk_buff *rep_skb; + size_t size; + u32 tgid; + int rc; + + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); + if (rc < 0) + return rc; + + rc = -EINVAL; + tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); + if (!stats) goto err; + rc = fill_tgid(tgid, NULL, stats); + if (rc < 0) + goto err; return send_reply(rep_skb, info); err: nlmsg_free(rep_skb); return rc; } +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) + return 
cmd_attr_register_cpumask(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) + return cmd_attr_deregister_cpumask(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) + return cmd_attr_pid(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) + return cmd_attr_tgid(info); + else + return -EINVAL; +} + static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; -- cgit v1.2.1 From 3d9e0cf1fe007b88db55d43dfdb6839e1a029ca5 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 27 Oct 2010 15:34:44 -0700 Subject: taskstats: split fill_pid function Separate the finding of a task_struct by pid or tgid from filling the taskstats data. This makes the code more readable. Signed-off-by: Michael Holzheu Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 50 +++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 9970cae04f15..c8231fb15708 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } -static int fill_pid(pid_t pid, struct task_struct *tsk, - struct taskstats *stats) +static void fill_stats(struct task_struct *tsk, struct taskstats *stats) { - int rc = 0; - - if (!tsk) { - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return -ESRCH; - } else - get_task_struct(tsk); - memset(stats, 0, sizeof(*stats)); /* * Each accounting subsystem adds calls to its functions to @@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); +} - /* Define err: label here if needed */ - put_task_struct(tsk); - return rc; +static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) +{ + struct task_struct *tsk; + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return -ESRCH; + fill_stats(tsk, stats); + put_task_struct(tsk); + return 0; } -static int fill_tgid(pid_t tgid, struct task_struct *first, - struct taskstats *stats) +static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) { - struct task_struct *tsk; + struct task_struct *tsk, *first; unsigned long flags; int rc = -ESRCH; @@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, * leaders who are already counted with the dead tasks */ rcu_read_lock(); - if (!first) - first = find_task_by_vpid(tgid); + first = find_task_by_vpid(tgid); if (!first || !lock_task_sighand(first, &flags)) goto out; @@ -268,7 +263,6 @@ out: return rc; } - static void fill_tgid_exit(struct task_struct *tsk) { unsigned long flags; @@ -483,7 +477,7 @@ static int cmd_attr_pid(struct genl_info *info) if (!stats) goto err; - rc = fill_pid(pid, NULL, stats); + rc = fill_stats_for_pid(pid, stats); if (rc < 0) goto err; return send_reply(rep_skb, info); @@ -513,7 +507,7 @@ static int cmd_attr_tgid(struct genl_info *info) if (!stats) goto err; - rc = fill_tgid(tgid, NULL, stats); + rc = fill_stats_for_tgid(tgid, stats); if (rc < 0) goto err; return send_reply(rep_skb, info); @@ -599,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (!stats) goto err; - rc = fill_pid(-1, tsk, stats); - if (rc < 0) - goto err; + fill_stats(tsk, stats); /* * Doesn't 
matter if tsk is the leader or the last group member leaving -- cgit v1.2.1 From d57af9b2142f31a39dcfdeb30776baadfc802827 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 27 Oct 2010 15:34:45 -0700 Subject: taskstats: use real microsecond granularity for CPU times The taskstats interface uses microsecond granularity for the user and system time values. The conversion from cputime to the taskstats values uses the cputime_to_msecs primitive, which effectively limits the granularity to milliseconds. Add the cputime_to_usecs primitive for architectures that have better, more precise CPU time values. Remove the cputime_to_msecs primitive because there are no users left. Signed-off-by: Michael Holzheu Acked-by: Balbir Singh Cc: Luck Tony Cc: Shailabh Nagar Cc: Martin Schwidefsky Cc: Oleg Nesterov Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Thomas Gleixner Cc: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 0a67e041edf8..24dc60d9fa1f 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_ppid = pid_alive(tsk) ? rcu_dereference(tsk->real_parent)->tgid : 0; rcu_read_unlock(); - stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; - stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; - stats->ac_utimescaled = - cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; - stats->ac_stimescaled = - cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC; + stats->ac_utime = cputime_to_usecs(tsk->utime); + stats->ac_stime = cputime_to_usecs(tsk->stime); + stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); + stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; -- cgit v1.2.1 From 5de1cb2d0f1c1e5475d2bedf65b76828f8cdde22 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Wed, 27 Oct 2010 15:34:52 -0700 Subject: kernel/resource.c: handle reinsertion of an already-inserted resource If the same resource is inserted into the resource tree (maybe not on purpose), an infinite loop will be created. In this situation, the kernel does not report any warning or error :( The command below will print endlessly. 
#cat /proc/iomem [akpm@linux-foundation.org: add WARN_ON()] Signed-off-by: Huang Shijie Cc: Jesse Barnes Cc: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 7b36976e5dea..9c9841cb6902 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -453,6 +453,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou if (first == parent) return first; + if (WARN_ON(first == new)) /* duplicated insertion */ + return first; if ((first->start > new->start) || (first->end < new->end)) break; -- cgit v1.2.1 From 61d8e11e519ee7912ab59610fba1aaf08e3c1d84 Mon Sep 17 00:00:00 2001 From: Zimny Lech Date: Wed, 27 Oct 2010 15:34:53 -0700 Subject: Remove duplicate includes from many files Signed-off-by: Zimny Lech Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_kprobe.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b8d2852baa4a..2dec9bcde8b4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include "trace.h" -- cgit v1.2.1 From b842f8faf6c7dc2005c6a70631c1a91bac02f180 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 1 Oct 2010 17:23:41 -0400 Subject: jump label: Fix module __init section race Jump label uses is_module_text_address() to ensure that the module __init sections are valid before updating them. However, between the check for a valid module __init section and the subsequent jump label update, the module's __init section could be freed out from under us. We fix this potential race by adding a notifier callback to the MODULE_STATE_LIVE state. This notifier is called *after* the __init section has been run but before it is going to be freed. In the callback, the jump label code zeros the key value for any __init jump code within the module, and we add a check for a non-zero key value when we update jump labels. In this way we require no additional data structures. Thanks to Mathieu Desnoyers for pointing out this race condition. Reported-by: Mathieu Desnoyers Cc: Masami Hiramatsu Signed-off-by: Jason Baron LKML-Reference: [ Renamed remove_module_init() to remove_jump_label_module_init() as suggested by Masami Hiramatsu. 
] Signed-off-by: Steven Rostedt --- kernel/jump_label.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 7be868bf25c6..be9e105345eb 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -168,7 +168,8 @@ void jump_label_update(unsigned long key, enum jump_label_type type) count = e_module->nr_entries; iter = e_module->table; while (count--) { - if (kernel_text_address(iter->code)) + if (iter->key && + kernel_text_address(iter->code)) arch_jump_label_transform(iter, type); iter++; } @@ -366,6 +367,39 @@ static void remove_jump_label_module(struct module *mod) } } +static void remove_jump_label_module_init(struct module *mod) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + struct jump_entry *iter; + int i, count; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + if (e_module->mod != mod) + continue; + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (within_module_init(iter->code, mod)) + iter->key = 0; + iter++; + } + } + } + } +} + static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) @@ -386,6 +420,11 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, remove_jump_label_module(mod); mutex_unlock(&jump_label_mutex); break; + case MODULE_STATE_LIVE: + mutex_lock(&jump_label_mutex); + remove_jump_label_module_init(mod); + mutex_unlock(&jump_label_mutex); + break; } return ret; } -- cgit v1.2.1 From 91bad2f8d3057482b9afb599f14421b007136960 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 1 Oct 2010 17:23:48 -0400 Subject: jump label: Fix deadlock b/w jump_label_mutex vs. text_mutex register_kprobe() downs the 'text_mutex' and then calls jump_label_text_reserved(), which downs the 'jump_label_mutex'. However, the jump label code takes those mutexes in the reverse order. Fix by requiring the caller of jump_label_text_reserved() to do the jump label locking via the newly added jump_label_lock() and jump_label_unlock(). Currently, kprobes is the only user of jump_label_text_reserved(). 
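For illustration only: the inverted ordering described above is the classic ABBA deadlock pattern. Below is a minimal userspace sketch, not kernel code; pthread mutexes and the names lock_a/lock_b stand in for text_mutex and jump_label_mutex.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER; /* "text_mutex" */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER; /* "jump_label_mutex" */

/* old register_kprobe() path: takes A, then B via the text_reserved check */
static void *path_one(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}

/* jump label update path: takes B, then A -- the reverse order */
static void *path_two(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_b);
	pthread_mutex_lock(&lock_a); /* deadlocks if path_one already holds lock_a */
	pthread_mutex_unlock(&lock_a);
	pthread_mutex_unlock(&lock_b);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, path_one, NULL);
	pthread_create(&t2, NULL, path_two, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("no deadlock this run, but the ordering is unsafe\n");
	return 0;
}

The patch removes the inversion by making the only caller, register_kprobe(), take jump_label_lock() itself before text_mutex, so both paths agree on a single lock order.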
Reported-by: Ingo Molnar Acked-by: Masami Hiramatsu Signed-off-by: Jason Baron LKML-Reference: <759032c48d5e30c27f0bba003d09bffa8e9f28bb.1285965957.git.jbaron@redhat.com> Signed-off-by: Steven Rostedt --- kernel/jump_label.c | 33 +++++++++++++++++++++------------ kernel/kprobes.c | 6 ++++++ 2 files changed, 27 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index be9e105345eb..12cce78e9568 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -39,6 +39,16 @@ struct jump_label_module_entry { struct module *mod; }; +void jump_label_lock(void) +{ + mutex_lock(&jump_label_mutex); +} + +void jump_label_unlock(void) +{ + mutex_unlock(&jump_label_mutex); +} + static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; @@ -152,7 +162,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type) struct jump_label_module_entry *e_module; int count; - mutex_lock(&jump_label_mutex); + jump_label_lock(); entry = get_jump_label_entry((jump_label_t)key); if (entry) { count = entry->nr_entries; @@ -175,7 +185,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type) } } } - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); } static int addr_conflict(struct jump_entry *entry, void *start, void *end) @@ -232,6 +242,7 @@ out: * overlaps with any of the jump label patch addresses. Code * that wants to modify kernel text should first verify that * it does not overlap with any of the jump label addresses. + * Caller must hold jump_label_mutex. * * returns 1 if there is an overlap, 0 otherwise */ @@ -242,7 +253,6 @@ int jump_label_text_reserved(void *start, void *end) struct jump_entry *iter_stop = __start___jump_table; int conflict = 0; - mutex_lock(&jump_label_mutex); iter = iter_start; while (iter < iter_stop) { if (addr_conflict(iter, start, end)) { @@ -257,7 +267,6 @@ int jump_label_text_reserved(void *start, void *end) conflict = module_conflict(start, end); #endif out: - mutex_unlock(&jump_label_mutex); return conflict; } @@ -268,7 +277,7 @@ static __init int init_jump_label(void) struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; - mutex_lock(&jump_label_mutex); + jump_label_lock(); ret = build_jump_label_hashtable(__start___jump_table, __stop___jump_table); iter = iter_start; @@ -276,7 +285,7 @@ static __init int init_jump_label(void) arch_jump_label_text_poke_early(iter->code); iter++; } - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); return ret; } early_initcall(init_jump_label); @@ -409,21 +418,21 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, switch (val) { case MODULE_STATE_COMING: - mutex_lock(&jump_label_mutex); + jump_label_lock(); ret = add_jump_label_module(mod); if (ret) remove_jump_label_module(mod); - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); break; case MODULE_STATE_GOING: - mutex_lock(&jump_label_mutex); + jump_label_lock(); remove_jump_label_module(mod); - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); break; case MODULE_STATE_LIVE: - mutex_lock(&jump_label_mutex); + jump_label_lock(); remove_jump_label_module_init(mod); - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); break; } return ret; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 99865c33a60d..9437e14f36bd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1146,13 +1146,16 @@ int __kprobes register_kprobe(struct kprobe *p) return ret; preempt_disable(); + jump_label_lock(); if 
(!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || ftrace_text_reserved(p->addr, p->addr) || jump_label_text_reserved(p->addr, p->addr)) { preempt_enable(); + jump_label_unlock(); return -EINVAL; } + jump_label_unlock(); /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ p->flags &= KPROBE_FLAG_DISABLED; @@ -1187,6 +1190,8 @@ int __kprobes register_kprobe(struct kprobe *p) INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); + jump_label_lock(); /* needed to call jump_label_text_reserved() */ + get_online_cpus(); /* For avoiding text_mutex deadlock. */ mutex_lock(&text_mutex); @@ -1214,6 +1219,7 @@ int __kprobes register_kprobe(struct kprobe *p) out: mutex_unlock(&text_mutex); put_online_cpus(); + jump_label_unlock(); mutex_unlock(&kprobe_mutex); if (probed_mod) -- cgit v1.2.1 From de31c3ca8179d7c21def7ecb56e4fec0c8659d36 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 18 Oct 2010 10:38:58 -0400 Subject: jump label: Fix error with preempt disable holding mutex Kprobes and jump label were having a race between mutexes that was fixed by reordering the jump label. But this reordering moved the jump label mutex into a preempt disable location. This patch does a little fiddling to move the grabbing of the jump label mutex from inside the preempt disable section and still keep the order correct between the mutex and the kprobes lock. Reported-by: Ingo Molnar Acked-by: Masami Hiramatsu Cc: Jason Baron Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9437e14f36bd..9737a76e106f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1145,17 +1145,13 @@ int __kprobes register_kprobe(struct kprobe *p) if (ret) return ret; - preempt_disable(); jump_label_lock(); + preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || ftrace_text_reserved(p->addr, p->addr) || - jump_label_text_reserved(p->addr, p->addr)) { - preempt_enable(); - jump_label_unlock(); - return -EINVAL; - } - jump_label_unlock(); + jump_label_text_reserved(p->addr, p->addr)) + goto fail_with_jump_label; /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ p->flags &= KPROBE_FLAG_DISABLED; @@ -1169,10 +1165,9 @@ int __kprobes register_kprobe(struct kprobe *p) * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. */ - if (unlikely(!try_module_get(probed_mod))) { - preempt_enable(); - return -EINVAL; - } + if (unlikely(!try_module_get(probed_mod))) + goto fail_with_jump_label; + /* * If the module freed .init.text, we couldn't insert * kprobes in there. 
@@ -1180,11 +1175,11 @@ int __kprobes register_kprobe(struct kprobe *p) if (within_module_init((unsigned long)p->addr, probed_mod) && probed_mod->state != MODULE_STATE_COMING) { module_put(probed_mod); - preempt_enable(); - return -EINVAL; + goto fail_with_jump_label; } } preempt_enable(); + jump_label_unlock(); p->nmissed = 0; INIT_LIST_HEAD(&p->list); @@ -1226,6 +1221,11 @@ out: module_put(probed_mod); return ret; + +fail_with_jump_label: + preempt_enable(); + jump_label_unlock(); + return -EINVAL; } EXPORT_SYMBOL_GPL(register_kprobe); -- cgit v1.2.1 From 95bcd683fb694a3e2d0538bf486430a0dfbb4111 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Oct 2010 11:02:43 -0400 Subject: jump label: Make arch_jump_label_text_poke_early() optional Some archs (like MIPS) do not need to do anything special for jump labels on startup. This patch adds a weak function stub for arch_jump_label_text_poke_early(). Cc: Jason Baron Cc: David Miller Cc: David Daney Suggested-by: Thomas Gleixner LKML-Reference: <1286218615-24011-2-git-send-email-ddaney@caviumnetworks.com> LKML-Reference: <20101015201037.703989993@goodmis.org> Signed-off-by: Steven Rostedt --- kernel/jump_label.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 12cce78e9568..3b79bd938330 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -270,6 +270,13 @@ out: return conflict; } +/* + * Not all archs need this. + */ +void __weak arch_jump_label_text_poke_early(jump_label_t addr) +{ +} + static __init int init_jump_label(void) { int ret; -- cgit v1.2.1
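A closing aside on the __weak stub added above: it relies on standard weak-symbol linkage, where a strong definition in another object file silently overrides the weak default at link time. A minimal userspace sketch of the same pattern follows; arch_setup_hook is an illustrative name, not a kernel symbol.

#include <stdio.h>

/* Weak no-op default, analogous to the arch_jump_label_text_poke_early()
 * stub: archs (like MIPS) that need nothing at startup simply inherit it. */
void __attribute__((weak)) arch_setup_hook(void)
{
	printf("weak default: nothing to do\n");
}

/* An arch that does need the hook links in a strong definition,
 * e.g. in arch.c:
 *
 *     void arch_setup_hook(void) { printf("arch override\n"); }
 *
 * and the linker picks the strong symbol over the weak one. */

int main(void)
{
	arch_setup_hook();
	return 0;
}

Compiled alone, this prints the weak default; linked together with a strong arch_setup_hook(), the override runs instead, with no #ifdef or registration code needed.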