From 1dd75f91ae713049eb6baaa640078f3a6549e522 Mon Sep 17 00:00:00 2001 From: "jhbird.choi@samsung.com" Date: Thu, 21 Jul 2011 15:29:14 +0900 Subject: genirq: Fix wrong bit operation (!msk & 0x01) should be !(msk & 0x01) Signed-off-by: Jonghwan Choi Link: http://lkml.kernel.org/r/1311229754-6003-1-git-send-email-jhbird.choi@samsung.com Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/irq/generic-chip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 3a2cab407b93..e38544dddb18 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -246,7 +246,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); for (i = gc->irq_base; msk; msk >>= 1, i++) { - if (!msk & 0x01) + if (!(msk & 0x01)) continue; if (flags & IRQ_GC_INIT_NESTED_LOCK) @@ -301,7 +301,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, raw_spin_unlock(&gc_lock); for (; msk; msk >>= 1, i++) { - if (!msk & 0x01) + if (!(msk & 0x01)) continue; /* Remove handler first. That will mask the irq line */ -- cgit v1.2.1 From f3637a5f2e2eb391ff5757bc83fb5de8f9726464 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 7 Jul 2011 22:32:17 +0200 Subject: irq: Always set IRQF_ONESHOT if no primary handler is specified If no primary handler is specified then a default one is assigned which always returns IRQ_WAKE_THREAD. This handler requires the IRQF_ONESHOT flag on LEVEL / EIO typed irqs because the source of interrupt is not disabled. Since it is required for those users and there is no difference for others it makes sense to add this flag unconditionally. Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/1310070737-18514-1-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a7840aeb0fb..3f9cd4799da7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1322,6 +1322,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; + irqflags |= IRQF_ONESHOT; } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); -- cgit v1.2.1 From b6873807a7143b7d6d8b06809295e559d07d7deb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 11 Jul 2011 12:17:31 +0200 Subject: irq: Track the owner of irq descriptor Interrupt descriptors can be allocated from modules. The interrupts are used by other modules, but we have no refcount on the module which provides the interrupts and there is no way to establish one on the device level as the interrupt using module is agnostic to the fact that the interrupt is provided by a module rather than by some builtin interrupt controller. To prevent removal of the interrupt providing module, we can track the owner of the interrupt descriptor, which also provides the relevant irq chip functions in the irq descriptor. request/setup_irq() can now acquire a refcount on the owner module to prevent unloading. free_irq() drops the refcount. Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/20110711101731.GA13804@Chamillionaire.breakpoint.cc Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 36 ++++++++++++++++++++++++------------ kernel/irq/manage.c | 17 +++++++++++++---- 2 files changed, 37 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 4c60a50e66b2..cb65d0360e31 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -70,7 +70,8 @@ static inline void desc_smp_init(struct irq_desc *desc, int node) { } static inline int desc_node(struct irq_desc *desc) { return 0; } #endif -static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) +static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, + struct module *owner) { int cpu; @@ -86,6 +87,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_count = 0; desc->irqs_unhandled = 0; desc->name = NULL; + desc->owner = owner; for_each_possible_cpu(cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; desc_smp_init(desc, node); @@ -128,7 +130,7 @@ static void free_masks(struct irq_desc *desc) static inline void free_masks(struct irq_desc *desc) { } #endif -static struct irq_desc *alloc_desc(int irq, int node) +static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) { struct irq_desc *desc; gfp_t gfp = GFP_KERNEL; @@ -147,7 +149,7 @@ static struct irq_desc *alloc_desc(int irq, int node) raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); - desc_set_defaults(irq, desc, node); + desc_set_defaults(irq, desc, node, owner); return desc; @@ -173,13 +175,14 @@ static void free_desc(unsigned int irq) kfree(desc); } -static int alloc_descs(unsigned int start, unsigned int cnt, int node) +static int alloc_descs(unsigned int start, unsigned int cnt, int node, + struct module *owner) { struct irq_desc *desc; int i; for (i = 0; i < cnt; i++) { - desc = alloc_desc(start + i, node); + desc = alloc_desc(start + i, node, owner); if (!desc) goto err; mutex_lock(&sparse_irq_lock); @@ -227,7 +230,7 @@ int __init early_irq_init(void) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { - desc = alloc_desc(i, node); + desc = alloc_desc(i, node, NULL); set_bit(i, allocated_irqs); irq_insert_desc(i, desc); } @@ -261,7 +264,7 @@ int __init early_irq_init(void) alloc_masks(&desc[i], GFP_KERNEL, node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - desc_set_defaults(i, &desc[i], node); + desc_set_defaults(i, &desc[i], node, NULL); } return arch_early_irq_init(); } @@ -276,8 +279,16 @@ static void free_desc(unsigned int irq) dynamic_irq_cleanup(irq); } -static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) +static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, + struct module *owner) { + u32 i; + + for (i = 0; i < cnt; i++) { + struct irq_desc *desc = irq_to_desc(start + i); + + desc->owner = owner; + } return start; } @@ -337,7 +348,8 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * Returns the first irq number or error code */ int __ref -irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) +__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, + struct module *owner) { int start, ret; @@ -366,13 +378,13 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node); + return alloc_descs(start, cnt, node, owner); err: mutex_unlock(&sparse_irq_lock); return ret; } -EXPORT_SYMBOL_GPL(irq_alloc_descs); +EXPORT_SYMBOL_GPL(__irq_alloc_descs); /** * irq_reserve_irqs - mark irqs allocated @@ -440,7 +452,7 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, desc_node(desc)); + desc_set_defaults(irq, desc, desc_node(desc), NULL); raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3f9cd4799da7..2e9425889fa8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -883,6 +883,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; + if (!try_module_get(desc->owner)) + return -ENODEV; /* * Some drivers like serial.c use request_irq() heavily, * so we have to be careful not to interfere with a @@ -906,8 +908,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ nested = irq_settings_is_nested_thread(desc); if (nested) { - if (!new->thread_fn) - return -EINVAL; + if (!new->thread_fn) { + ret = -EINVAL; + goto out_mput; + } /* * Replace the primary handler which was provided from * the driver for non nested interrupt handling by the @@ -929,8 +933,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name); - if (IS_ERR(t)) - return PTR_ERR(t); + if (IS_ERR(t)) { + ret = PTR_ERR(t); + goto out_mput; + } /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code @@ -1095,6 +1101,8 @@ out_thread: kthread_stop(t); put_task_struct(t); } +out_mput: + module_put(desc->owner); return ret; } @@ -1203,6 +1211,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) put_task_struct(action->thread); } + module_put(desc->owner); return action; } -- cgit v1.2.1 From 80e0401e35410a69bfae05b454db8a7187edd6b8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Aug 2011 14:26:17 +0200 Subject: lockdep: Fix wrong assumption in match_held_lock match_held_lock() was assuming it was being called on a lock class that had already seen usage. This condition was true for bug-free code using lockdep_assert_held(), since you're in fact holding the lock when calling it. However the assumption fails the moment you assume the assertion can fail, which is the whole point of having the assertion in the first place. Anyway, now that there's more lockdep_is_held() users, notably __rcu_dereference_check(), its much easier to trigger this since we test for a number of locks and we only need to hold any one of them to be good. Reported-by: Sergey Senozhatsky Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1312547787.28695.2.camel@twins Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8c24294e477f..91d67ce3a8d5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3111,7 +3111,13 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) if (!class) class = look_up_lock_class(lock, 0); - if (DEBUG_LOCKS_WARN_ON(!class)) + /* + * If look_up_lock_class() failed to find a class, we're trying + * to test if we hold a lock that has never yet been acquired. + * Clearly if the lock hasn't been acquired _ever_, we're not + * holding it either, so report failure. + */ + if (!class) return 0; if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) -- cgit v1.2.1 From 971c90bfa2f0b4fe52d6d9002178d547706f1343 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 4 Aug 2011 07:25:35 -0700 Subject: alarmtimers: Avoid possible null pointer traversal We don't check if old_setting is non null before assigning it, so correct this. CC: Thomas Gleixner CC: stable@kernel.org Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 59f369f98a04..1dee3f62a6a7 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -479,11 +479,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, if (!rtcdev) return -ENOTSUPP; - /* Save old values */ - old_setting->it_interval = - ktime_to_timespec(timr->it.alarmtimer.period); - old_setting->it_value = - ktime_to_timespec(timr->it.alarmtimer.node.expires); + if (old_setting) + alarm_timer_get(timr, old_setting); /* If the timer was already set, cancel it */ alarm_cancel(&timr->it.alarmtimer); -- cgit v1.2.1 From ea7802f630d356acaf66b3c0b28c00a945fc35dc Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 4 Aug 2011 07:51:56 -0700 Subject: alarmtimers: Memset itimerspec passed into alarm_timer_get Following common_timer_get, zero out the itimerspec passed in. CC: Thomas Gleixner CC: stable@kernel.org Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 1dee3f62a6a7..0e9263f6fd09 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -441,6 +441,8 @@ static int alarm_timer_create(struct k_itimer *new_timer) static void alarm_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { + memset(cur_setting, 0, sizeof(struct itimerspec)); + cur_setting->it_interval = ktime_to_timespec(timr->it.alarmtimer.period); cur_setting->it_value = -- cgit v1.2.1 From 6af7e471e5a7746b8024d70b4363d3dfe41d36b8 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Aug 2011 10:26:09 -0700 Subject: alarmtimers: Avoid possible denial of service with high freq periodic timers Its possible to jam up the alarm timers by setting very small interval timers, which will cause the alarmtimer subsystem to spend all of its time firing and restarting timers. This can effectivly lock up a box. A deeper fix is needed, closely mimicking the hrtimer code, but for now just cap the interval to 100us to avoid userland hanging the system. CC: Thomas Gleixner CC: stable@kernel.org Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0e9263f6fd09..ea5e1a928d5b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -481,6 +481,15 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, if (!rtcdev) return -ENOTSUPP; + /* + * XXX HACK! Currently we can DOS a system if the interval + * period on alarmtimers is too small. Cap the interval here + * to 100us and solve this properly in a future patch! -jstultz + */ + if ((new_setting->it_interval.tv_sec == 0) && + (new_setting->it_interval.tv_nsec < 100000)) + new_setting->it_interval.tv_nsec = 100000; + if (old_setting) alarm_timer_get(timr, old_setting); -- cgit v1.2.1 From c09c47caedc9854d59378d6e34c989e51cfdd2b4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 11 Aug 2011 10:36:05 +0200 Subject: blktrace: add FLUSH/FUA support Add FLUSH/FUA support to blktrace. As FLUSH precedes WRITE and/or FUA follows WRITE, use the same 'F' flag for both cases and distinguish them by their (relative) position. The end results look like (other flags might be shown also): - WRITE: W - WRITE_FLUSH: FW - WRITE_FUA: WF - WRITE_FLUSH_FUA: FWF Note that we reuse TC_BARRIER due to lack of bit space of act_mask so that the older versions of blktrace tools will report flush requests as barriers from now on. Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Namhyung Kim Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6957aa298dfa..7c910a5593a6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -206,6 +206,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); what |= MASK_TC_BIT(rw, DISCARD); + what |= MASK_TC_BIT(rw, FLUSH); + what |= MASK_TC_BIT(rw, FUA); pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -1054,6 +1056,9 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) goto out; } + if (tc & BLK_TC_FLUSH) + rwbs[i++] = 'F'; + if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; else if (tc & BLK_TC_WRITE) @@ -1063,10 +1068,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) else rwbs[i++] = 'N'; + if (tc & BLK_TC_FUA) + rwbs[i++] = 'F'; if (tc & BLK_TC_AHEAD) rwbs[i++] = 'A'; - if (tc & BLK_TC_BARRIER) - rwbs[i++] = 'B'; if (tc & BLK_TC_SYNC) rwbs[i++] = 'S'; if (tc & BLK_TC_META) @@ -1132,7 +1137,7 @@ typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); static int blk_log_action_classic(struct trace_iterator *iter, const char *act) { - char rwbs[6]; + char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; @@ -1148,7 +1153,7 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act) static int blk_log_action(struct trace_iterator *iter, const char *act) { - char rwbs[6]; + char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); @@ -1561,7 +1566,7 @@ static const struct { } mask_maps[] = { { BLK_TC_READ, "read" }, { BLK_TC_WRITE, "write" }, - { BLK_TC_BARRIER, "barrier" }, + { BLK_TC_FLUSH, "flush" }, { BLK_TC_SYNC, "sync" }, { BLK_TC_QUEUE, "queue" }, { BLK_TC_REQUEUE, "requeue" }, @@ -1573,6 +1578,7 @@ static const struct { { BLK_TC_META, "meta" }, { BLK_TC_DISCARD, "discard" }, { BLK_TC_DRV_DATA, "drv_data" }, + { BLK_TC_FUA, "fua" }, }; static int blk_trace_str2mask(const char *str) @@ -1788,6 +1794,9 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) { int i = 0; + if (rw & REQ_FLUSH) + rwbs[i++] = 'F'; + if (rw & WRITE) rwbs[i++] = 'W'; else if (rw & REQ_DISCARD) @@ -1797,6 +1806,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) else rwbs[i++] = 'N'; + if (rw & REQ_FUA) + rwbs[i++] = 'F'; if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; if (rw & REQ_SYNC) -- cgit v1.2.1 From c59d87c460767bc35dafd490139d3cfe78fb8da4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 12 Aug 2011 16:21:35 -0500 Subject: xfs: remove subdirectories Use the move from Linux 2.6 to Linux 3.x as an excuse to kill the annoying subdirectories in the XFS source code. Besides the large amount of file rename the only changes are to the Makefile, a few files including headers with the subdirectory prefix, and the binary sysctl compat code that includes a header under fs/xfs/ from kernel/. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- kernel/sysctl_binary.c | 2 +- kernel/sysctl_check.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 3b8e028b9601..e8bffbe2ba4b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1,6 +1,6 @@ #include #include -#include "../fs/xfs/linux-2.6/xfs_sysctl.h" +#include "../fs/xfs/xfs_sysctl.h" #include #include #include diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 4e4932a7b360..362da653813d 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -1,6 +1,6 @@ #include #include -#include "../fs/xfs/linux-2.6/xfs_sysctl.h" +#include "../fs/xfs/xfs_sysctl.h" #include #include #include -- cgit v1.2.1 From 17f2ae7f677f023997e02fd2ebabd90ea2a0390d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 14 Aug 2011 13:34:31 +0200 Subject: PM / Domains: Fix build for CONFIG_PM_RUNTIME unset Function genpd_queue_power_off_work() is not defined for CONFIG_PM_RUNTIME, so pm_genpd_poweroff_unused() causes a build error to happen in that case. Fix the problem by making pm_genpd_poweroff_unused() depend on CONFIG_PM_RUNTIME too. Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index b1914cb9095c..3744c594b19b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -231,3 +231,7 @@ config PM_CLK config PM_GENERIC_DOMAINS bool depends on PM + +config PM_GENERIC_DOMAINS_RUNTIME + def_bool y + depends on PM_RUNTIME && PM_GENERIC_DOMAINS -- cgit v1.2.1 From d522a0d17963e9c2e556db2cbd60c96d40505b6c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 18 Aug 2011 12:19:27 -0700 Subject: irqdesc: fix new kernel-doc warning Fix kernel-doc warning in irqdesc.c: Warning(kernel/irq/irqdesc.c:353): No description found for parameter 'owner' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/irq/irqdesc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index cb65d0360e31..039b889ea053 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -344,6 +344,7 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated + * @owner: Owning module (can be NULL) * * Returns the first irq number or error code */ -- cgit v1.2.1 From 69dd3d8e29e294caaf63eb5e8a72d250279f9e5f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 23 Aug 2011 10:36:51 -0700 Subject: Revert "irq: Always set IRQF_ONESHOT if no primary handler is specified" This reverts commit f3637a5f2e2eb391ff5757bc83fb5de8f9726464. It turns out that this breaks several drivers, one example being OMAP boards which use the on-board OMAP UARTs and the omap-serial driver that will not boot to userspace after the commit. Paul Walmsley reports that enabling CONFIG_DEBUG_SHIRQ reveals 'IRQ handler type mismatch' errors: IRQ handler type mismatch for IRQ 74 current handler: serial idle ... and the reason is that setting IRQF_ONESHOT will now result in those interrupt handlers having different IRQF flags, and thus being unsharable. So the commit log in the reverted commit: "Since it is required for those users and there is no difference for others it makes sense to add this flag unconditionally." is simply not true: there may not be any difference from a "actions at irq time", but there is a *big* difference wrt this flag testing irq management (see __setup_irq() in kernel/irq/manage.c). One solution may be to stop verifying IRQF_ONESHOT in __setup_irq(), but right now the safe course of action is to revert the change. Let's revisit this in a later merge window. Reported-by: Paul Walmsley Cc: Sebastian Andrzej Siewior Requested-by: Alan Cox Acked-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2e9425889fa8..9b956fa20308 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1331,7 +1331,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; - irqflags |= IRQF_ONESHOT; } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); -- cgit v1.2.1 From be27425dcc516fd08245b047ea57f83b8f6f0903 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 19 Aug 2011 16:15:10 -0700 Subject: Add a personality to report 2.6.x version numbers I ran into a couple of programs which broke with the new Linux 3.0 version. Some of those were binary only. I tried to use LD_PRELOAD to work around it, but it was quite difficult and in one case impossible because of a mix of 32bit and 64bit executables. For example, all kind of management software from HP doesnt work, unless we pretend to run a 2.6 kernel. $ uname -a Linux svivoipvnx001 3.0.0-08107-g97cd98f #1062 SMP Fri Aug 12 18:11:45 CEST 2011 i686 i686 i386 GNU/Linux $ hpacucli ctrl all show Error: No controllers detected. $ rpm -qf /usr/sbin/hpacucli hpacucli-8.75-12.0 Another notable case is that Python now reports "linux3" from sys.platform(); which in turn can break things that were checking sys.platform() == "linux2": https://bugzilla.mozilla.org/show_bug.cgi?id=664564 It seems pretty clear to me though it's a bug in the apps that are using '==' instead of .startswith(), but this allows us to unbreak broken programs. This patch adds a UNAME26 personality that makes the kernel report a 2.6.40+x version number instead. The x is the x in 3.x. I know this is somewhat ugly, but I didn't find a better workaround, and compatibility to existing programs is important. Some programs also read /proc/sys/kernel/osrelease. This can be worked around in user space with mount --bind (and a mount namespace) To use: wget ftp://ftp.kernel.org/pub/linux/kernel/people/ak/uname26/uname26.c gcc -o uname26 uname26.c ./uname26 program Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/sys.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index dd948a1fca4c..18ee1d2f6474 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include @@ -44,6 +46,8 @@ #include #include +/* Move somewhere else to avoid recompiling? */ +#include #include #include @@ -1161,6 +1165,34 @@ DECLARE_RWSEM(uts_sem); #define override_architecture(name) 0 #endif +/* + * Work around broken programs that cannot handle "Linux 3.0". + * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + */ +static int override_release(char __user *release, int len) +{ + int ret = 0; + char buf[len]; + + if (current->personality & UNAME26) { + char *rest = UTS_RELEASE; + int ndots = 0; + unsigned v; + + while (*rest) { + if (*rest == '.' && ++ndots >= 3) + break; + if (!isdigit(*rest) && *rest != '.') + break; + rest++; + } + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; + snprintf(buf, len, "2.6.%u%s", v, rest); + ret = copy_to_user(release, buf, len); + } + return ret; +} + SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { int errno = 0; @@ -1170,6 +1202,8 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) errno = -EFAULT; up_read(&uts_sem); + if (!errno && override_release(name->release, sizeof(name->release))) + errno = -EFAULT; if (!errno && override_architecture(name)) errno = -EFAULT; return errno; @@ -1191,6 +1225,8 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) error = -EFAULT; up_read(&uts_sem); + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; if (!error && override_architecture(name)) error = -EFAULT; return error; @@ -1225,6 +1261,8 @@ SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) if (!error && override_architecture(name)) error = -EFAULT; + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; return error ? -EFAULT : 0; } #endif -- cgit v1.2.1 From 4c30c6f566c0989ddaee3407da44751e340a63ed Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 25 Aug 2011 15:59:11 -0700 Subject: kernel/printk: do not turn off bootconsole in printk_late_init() if keep_bootcon It seems that 7bf693951a8e ("console: allow to retain boot console via boot option keep_bootcon") doesn't always achieve what it aims, as when printk_late_init() runs it unconditionally turns off all boot consoles. With this patch, I am able to see more messages on the boot console in KVM guests than I can without, when keep_bootcon is specified. I think it is appropriate for the relevant -stable trees. However, it's more of an annoyance than a serious bug (ideally you don't need to keep the boot console around as console handover should be working -- I was encountering a situation where the console handover wasn't working and not having the boot console available meant I couldn't see why). Signed-off-by: Nishanth Aravamudan Cc: David S. Miller Cc: Alan Cox Cc: Greg KH Acked-by: Fabio M. Di Nitto Cc: [2.6.39.x, 3.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 836a2ae0ac31..28a40d8171b8 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1604,7 +1604,7 @@ static int __init printk_late_init(void) struct console *con; for_each_console(con) { - if (con->flags & CON_BOOT) { + if (!keep_bootcon && con->flags & CON_BOOT) { printk(KERN_INFO "turn off boot console %s%d\n", con->name, con->index); unregister_console(con); -- cgit v1.2.1 From f5b940997397229975ea073679b03967932a541b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 26 Aug 2011 18:03:11 -0400 Subject: All Arch: remove linkage for sys_nfsservctl system call The nfsservctl system call is now gone, so we should remove all linkage for it. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 62cbc8877fef..a9a5de07c4f1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -16,7 +16,6 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } -cond_syscall(sys_nfsservctl); cond_syscall(sys_quotactl); cond_syscall(sys32_quotactl); cond_syscall(sys_acct); -- cgit v1.2.1 From c259e01a1ec90063042f758e409cd26b2a0963c8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jun 2011 19:47:00 +0200 Subject: sched: Separate the scheduler entry for preemption Block-IO and workqueues call into notifier functions from the scheduler core code with interrupts and preemption disabled. These calls should be made before entering the scheduler core. To simplify this, separate the scheduler core code into __schedule(). __schedule() is directly called from the places which set PREEMPT_ACTIVE and from schedule(). This allows us to add the work checks into schedule(), so they are only called when a task voluntary goes to sleep. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Tejun Heo Cc: Jens Axboe Cc: Linus Torvalds Cc: stable@kernel.org # 2.6.39+ Link: http://lkml.kernel.org/r/20110622174918.813258321@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..ec15e8129cf7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4279,9 +4279,9 @@ pick_next_task(struct rq *rq) } /* - * schedule() is the main scheduler function. + * __schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +static void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -4369,6 +4369,11 @@ need_resched: if (need_resched()) goto need_resched; } + +asmlinkage void schedule(void) +{ + __schedule(); +} EXPORT_SYMBOL(schedule); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER @@ -4435,7 +4440,7 @@ asmlinkage void __sched notrace preempt_schedule(void) do { add_preempt_count_notrace(PREEMPT_ACTIVE); - schedule(); + __schedule(); sub_preempt_count_notrace(PREEMPT_ACTIVE); /* @@ -4463,7 +4468,7 @@ asmlinkage void __sched preempt_schedule_irq(void) do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); - schedule(); + __schedule(); local_irq_disable(); sub_preempt_count(PREEMPT_ACTIVE); @@ -5588,7 +5593,7 @@ static inline int should_resched(void) static void __cond_resched(void) { add_preempt_count(PREEMPT_ACTIVE); - schedule(); + __schedule(); sub_preempt_count(PREEMPT_ACTIVE); } -- cgit v1.2.1 From 9c40cef2b799f9b5e7fa5de4d2ad3a0168ba118c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jun 2011 19:47:01 +0200 Subject: sched: Move blk_schedule_flush_plug() out of __schedule() There is no real reason to run blk_schedule_flush_plug() with interrupts and preemption disabled. Move it into schedule() and call it when the task is going voluntarily to sleep. There might be false positives when the task is woken between that call and actually scheduling, but that's not really different from being woken immediately after switching away. This fixes a deadlock in the scheduler where the blk_schedule_flush_plug() callchain enables interrupts and thereby allows a wakeup to happen of the task that's going to sleep. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Tejun Heo Cc: Jens Axboe Cc: Linus Torvalds Cc: stable@kernel.org # 2.6.39+ Link: http://lkml.kernel.org/n/tip-dwfxtra7yg1b5r65m32ywtct@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ec15e8129cf7..511732c39b6e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4322,16 +4322,6 @@ need_resched: if (to_wakeup) try_to_wake_up_local(to_wakeup); } - - /* - * If we are going to sleep and we have plugged IO - * queued, make sure to submit it to avoid deadlocks. - */ - if (blk_needs_flush_plug(prev)) { - raw_spin_unlock(&rq->lock); - blk_schedule_flush_plug(prev); - raw_spin_lock(&rq->lock); - } } switch_count = &prev->nvcsw; } @@ -4370,8 +4360,23 @@ need_resched: goto need_resched; } +static inline void sched_submit_work(struct task_struct *tsk) +{ + if (!tsk->state) + return; + /* + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. + */ + if (blk_needs_flush_plug(tsk)) + blk_schedule_flush_plug(tsk); +} + asmlinkage void schedule(void) { + struct task_struct *tsk = current; + + sched_submit_work(tsk); __schedule(); } EXPORT_SYMBOL(schedule); -- cgit v1.2.1 From feff8fa0075bdfd43c841e9d689ed81adda988d6 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 18 Aug 2011 20:36:57 +0800 Subject: sched: Fix a memory leak in __sdt_free() This patch fixes the following memory leak: unreferenced object 0xffff880107266800 (size 512): comm "sched-powersave", pid 3718, jiffies 4323097853 (age 27495.450s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] create_object+0x187/0x28b [] kmemleak_alloc+0x73/0x98 [] __kmalloc_node+0x104/0x159 [] kzalloc_node.clone.97+0x15/0x17 [] build_sched_domains+0xb7/0x7f3 [] partition_sched_domains+0x1db/0x24a [] do_rebuild_sched_domains+0x3b/0x47 [] rebuild_sched_domains+0x10/0x12 [] sched_power_savings_store+0x6c/0x7b [] sched_mc_power_savings_store+0x16/0x18 [] sysdev_class_store+0x20/0x22 [] sysfs_write_file+0x108/0x144 [] vfs_write+0xaf/0x102 [] sys_write+0x4d/0x74 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff Signed-off-by: WANG Cong Signed-off-by: Peter Zijlstra Cc: stable@kernel.org # 3.0 Link: http://lkml.kernel.org/r/1313671017-4112-1-git-send-email-amwang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 511732c39b6e..c79e7c63a4aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7453,6 +7453,7 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); if (sd && (sd->flags & SD_OVERLAP)) free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); kfree(*per_cpu_ptr(sdd->sg, j)); kfree(*per_cpu_ptr(sdd->sgp, j)); } -- cgit v1.2.1 From a8d757ef076f0f95f13a918808824058de25b3eb Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 25 Aug 2011 15:58:03 +0200 Subject: perf events: Fix slow and broken cgroup context switch code The current cgroup context switch code was incorrect leading to bogus counts. Furthermore, as soon as there was an active cgroup event on a CPU, the context switch cost on that CPU would increase by a significant amount as demonstrated by a simple ping/pong example: $ ./pong Both processes pinned to CPU1, running for 10s 10684.51 ctxsw/s Now start a cgroup perf stat: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 $ ./pong Both processes pinned to CPU1, running for 10s 6674.61 ctxsw/s That's a 37% penalty. Note that pong is not even in the monitored cgroup. The results shown by perf stat are bogus: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 Performance counter stats for 'sleep 100': CPU1 cycles test CPU1 16,984,189,138 cycles # 0.000 GHz The second 'cycles' event should report a count @ CPU clock (here 2.4GHz) as it is counting across all cgroups. The patch below fixes the bogus accounting and bypasses any cgroup switches in case the outgoing and incoming tasks are in the same cgroup. With this patch the same test now yields: $ ./pong Both processes pinned to CPU1, running for 10s 10775.30 ctxsw/s Start perf stat with cgroup: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Run pong outside the cgroup: $ /pong Both processes pinned to CPU1, running for 10s 10687.80 ctxsw/s The penalty is now less than 2%. And the results for perf stat are correct: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz Now perf stat reports the correct counts for for the non cgroup event. If we run pong inside the cgroup, then we also get the correct counts: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 22,297,726,205 cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz 10.001457237 seconds time elapsed Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110825135803.GA4697@quad Signed-off-by: Ingo Molnar --- kernel/events/core.c | 63 +++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched.c | 2 +- 2 files changed, 54 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b8785e26ee1c..45847fbb599a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -399,14 +399,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_restore(flags); } -static inline void perf_cgroup_sched_out(struct task_struct *task) +static inline void perf_cgroup_sched_out(struct task_struct *task, + struct task_struct *next) { - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + struct perf_cgroup *cgrp1; + struct perf_cgroup *cgrp2 = NULL; + + /* + * we come here when we know perf_cgroup_events > 0 + */ + cgrp1 = perf_cgroup_from_task(task); + + /* + * next is NULL when called from perf_event_enable_on_exec() + * that will systematically cause a cgroup_switch() + */ + if (next) + cgrp2 = perf_cgroup_from_task(next); + + /* + * only schedule out current cgroup events if we know + * that we are switching to a different cgroup. Otherwise, + * do no touch the cgroup events. + */ + if (cgrp1 != cgrp2) + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); } -static inline void perf_cgroup_sched_in(struct task_struct *task) +static inline void perf_cgroup_sched_in(struct task_struct *prev, + struct task_struct *task) { - perf_cgroup_switch(task, PERF_CGROUP_SWIN); + struct perf_cgroup *cgrp1; + struct perf_cgroup *cgrp2 = NULL; + + /* + * we come here when we know perf_cgroup_events > 0 + */ + cgrp1 = perf_cgroup_from_task(task); + + /* prev can never be NULL */ + cgrp2 = perf_cgroup_from_task(prev); + + /* + * only need to schedule in cgroup events if we are changing + * cgroup during ctxsw. Cgroup events were not scheduled + * out of ctxsw out if that was not the case. + */ + if (cgrp1 != cgrp2) + perf_cgroup_switch(task, PERF_CGROUP_SWIN); } static inline int perf_cgroup_connect(int fd, struct perf_event *event, @@ -518,11 +558,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) { } -static inline void perf_cgroup_sched_out(struct task_struct *task) +static inline void perf_cgroup_sched_out(struct task_struct *task, + struct task_struct *next) { } -static inline void perf_cgroup_sched_in(struct task_struct *task) +static inline void perf_cgroup_sched_in(struct task_struct *prev, + struct task_struct *task) { } @@ -1988,7 +2030,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * cgroup event are system-wide mode only */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_out(task); + perf_cgroup_sched_out(task, next); } static void task_ctx_sched_out(struct perf_event_context *ctx) @@ -2153,7 +2195,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -void __perf_event_task_sched_in(struct task_struct *task) +void __perf_event_task_sched_in(struct task_struct *prev, + struct task_struct *task) { struct perf_event_context *ctx; int ctxn; @@ -2171,7 +2214,7 @@ void __perf_event_task_sched_in(struct task_struct *task) * cgroup event are system-wide mode only */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_in(task); + perf_cgroup_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2427,7 +2470,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) * ctxswin cgroup events which are already scheduled * in. */ - perf_cgroup_sched_out(current); + perf_cgroup_sched_out(current, NULL); raw_spin_lock(&ctx->lock); task_ctx_sched_out(ctx); diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..0408cdc6d572 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3065,7 +3065,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_disable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - perf_event_task_sched_in(current); + perf_event_task_sched_in(prev, current); #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -- cgit v1.2.1 From 7f310a5d4e8525ac0cc2f58c973d2100ce034410 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 23 Jun 2011 16:34:38 -0400 Subject: perf_event: Fix broken calc_timer_values() We detected a serious issue with PERF_SAMPLE_READ and timing information when events were being multiplexing. Samples would have time_running > time_enabled. That was easy to reproduce with a libpfm4 example (ran 3 times to cause multiplexing on Core 2): $ syst_smpl -e uops_retired:freq=1 & $ syst_smpl -e uops_retired:freq=1 & $ syst_smpl -e uops_retired:freq=1 & IIP:0x0000000040062d ... PERIOD:2355332948 ENA=40144625315 RUN=60014875184 syst_smpl: WARNING: time_running > time_enabled 63277537998 uops_retired:freq=1 , scaled The bug was not present in kernel up to (and including) 3.0. It turns out the bug was introduced by the following commit: commit c4794295917ebeda8013b6cb9c8d71ab4f74a1fa events: Move lockless timer calculation into helper function The parameters of the function got reversed yet the call sites were not updated to reflect the change. That lead to time_running and time_enabled being swapped. That had no effect when there was no multiplexing because in that case time_running = time_enabled but it would show up in any other scenario. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110829124112.GA4828@quad Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 45847fbb599a..0f857782d06f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3396,8 +3396,8 @@ static int perf_event_index(struct perf_event *event) } static void calc_timer_values(struct perf_event *event, - u64 *running, - u64 *enabled) + u64 *enabled, + u64 *running) { u64 now, ctx_time; -- cgit v1.2.1